5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
40 /* At least one required character in the target string is expressible only in
42 const char* const non_utf8_target_but_utf8_required
43 = "Can't match, because target string needs to be in UTF-8\n";
46 * pregcomp and pregexec -- regsub and regerror are not used in perl
48 * Copyright (c) 1986 by University of Toronto.
49 * Written by Henry Spencer. Not derived from licensed software.
51 * Permission is granted to anyone to use this software for any
52 * purpose on any computer system, and to redistribute it freely,
53 * subject to the following restrictions:
55 * 1. The author is not responsible for the consequences of use of
56 * this software, no matter how awful, even if they arise
59 * 2. The origin of this software must not be misrepresented, either
60 * by explicit claim or by omission.
62 * 3. Altered versions must be plainly marked as such, and must not
63 * be misrepresented as being the original software.
65 **** Alterations to Henry's code are...
67 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
68 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
69 **** by Larry Wall and others
71 **** You may distribute under the terms of either the GNU General Public
72 **** License or the Artistic License, as specified in the README file.
74 * Beware that some of this code is subtly aware of the way operator
75 * precedence is structured in regular expressions. Serious changes in
76 * regular-expression syntax might require a total rethink.
79 #define PERL_IN_REGEXEC_C
83 #ifdef PERL_IN_XSUB_RE
89 #include "inline_invlist.c"
90 #include "unicode_constants.h"
92 #define RF_tainted 1 /* tainted information used? e.g. locale */
93 #define RF_warned 2 /* warned about big count? */
95 #define RF_utf8 8 /* Pattern contains multibyte chars? */
97 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
99 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
102 #define STATIC static
105 /* Valid for non-utf8 strings: avoids the reginclass
106 * call if there are no complications: i.e., if everything matchable is
107 * straight forward in the bitmap */
108 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
109 : ANYOF_BITMAP_TEST(p,*(c)))
115 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
116 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
118 #define HOPc(pos,off) \
119 (char *)(PL_reg_match_utf8 \
120 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
122 #define HOPBACKc(pos, off) \
123 (char*)(PL_reg_match_utf8\
124 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
125 : (pos - off >= PL_bostr) \
129 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
130 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
133 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
134 #define NEXTCHR_IS_EOS (nextchr < 0)
136 #define SET_nextchr \
137 nextchr = ((locinput < PL_regeol) ? UCHARAT(locinput) : NEXTCHR_EOS)
139 #define SET_locinput(p) \
144 /* these are unrolled below in the CCC_TRY_XXX defined */
145 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
146 if (!CAT2(PL_utf8_,class)) { \
148 ENTER; save_re_context(); \
149 ok=CAT2(is_utf8_,class)((const U8*)str); \
150 PERL_UNUSED_VAR(ok); \
151 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
152 /* Doesn't do an assert to verify that is correct */
153 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
154 if (!CAT2(PL_utf8_,class)) { \
155 bool throw_away PERL_UNUSED_DECL; \
156 ENTER; save_re_context(); \
157 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
160 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
161 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
162 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
164 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
165 /* No asserts are done for some of these, in case called on a */ \
166 /* Unicode version in which they map to nothing */ \
167 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
168 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
170 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
172 /* The actual code for CCC_TRY, which uses several variables from the routine
173 * it's callable from. It is designed to be the bulk of a case statement.
174 * FUNC is the macro or function to call on non-utf8 targets that indicate if
175 * nextchr matches the class.
176 * UTF8_TEST is the whole test string to use for utf8 targets
177 * LOAD is what to use to test, and if not present to load in the swash for the
179 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
181 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
182 * utf8 and a variant, load the swash if necessary and test using the utf8
183 * test. Advance to the next character if test is ok, otherwise fail; If not
184 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
185 * fails, or advance to the next character */
187 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
188 if (NEXTCHR_IS_EOS) { \
191 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
192 LOAD_UTF8_CHARCLASS(CLASS, STR); \
193 if (POS_OR_NEG (UTF8_TEST)) { \
197 else if (POS_OR_NEG (FUNC(nextchr))) { \
200 goto increment_locinput;
202 /* Handle the non-locale cases for a character class and its complement. It
203 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
204 * This is because that code fails when the test succeeds, so we want to have
205 * the test fail so that the code succeeds. The swash is stored in a
206 * predictable PL_ place */
207 #define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
210 _CCC_TRY_CODE( !, FUNC, \
211 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
212 (U8*)locinput, TRUE)), \
215 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
216 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
217 (U8*)locinput, TRUE)), \
220 /* Generate the case statements for both locale and non-locale character
221 * classes in regmatch for classes that don't have special unicode semantics.
222 * Locales don't use an immediate swash, but an intermediary special locale
223 * function that is called on the pointer to the current place in the input
224 * string. That function will resolve to needing the same swash. One might
225 * think that because we don't know what the locale will match, we shouldn't
226 * check with the swash loading function that it loaded properly; ie, that we
227 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
228 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
230 #define CCC_TRY(NAME, NNAME, FUNC, \
231 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
232 NAMEA, NNAMEA, FUNCA, \
235 PL_reg_flags |= RF_tainted; \
236 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
238 PL_reg_flags |= RF_tainted; \
239 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
242 if (NEXTCHR_IS_EOS || ! FUNCA(nextchr)) { \
245 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
249 if (NEXTCHR_IS_EOS || FUNCA(nextchr)) { \
252 goto increment_locinput; \
253 /* Generate the non-locale cases */ \
254 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
256 /* This is like CCC_TRY, but has an extra set of parameters for generating case
257 * statements to handle separate Unicode semantics nodes */
258 #define CCC_TRY_U(NAME, NNAME, FUNC, \
259 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
260 NAMEU, NNAMEU, FUNCU, \
261 NAMEA, NNAMEA, FUNCA, \
263 CCC_TRY(NAME, NNAME, FUNC, \
264 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
265 NAMEA, NNAMEA, FUNCA, \
267 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
269 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
271 /* for use after a quantifier and before an EXACT-like node -- japhy */
272 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
274 * NOTE that *nothing* that affects backtracking should be in here, specifically
275 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
276 * node that is in between two EXACT like nodes when ascertaining what the required
277 * "follow" character is. This should probably be moved to regex compile time
278 * although it may be done at run time beause of the REF possibility - more
279 * investigation required. -- demerphq
281 #define JUMPABLE(rn) ( \
283 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
285 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
286 OP(rn) == PLUS || OP(rn) == MINMOD || \
288 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
290 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
292 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
295 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
296 we don't need this definition. */
297 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
298 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
299 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
302 /* ... so we use this as its faster. */
303 #define IS_TEXT(rn) ( OP(rn)==EXACT )
304 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
305 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
306 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
311 Search for mandatory following text node; for lookahead, the text must
312 follow but for lookbehind (rn->flags != 0) we skip to the next step.
314 #define FIND_NEXT_IMPT(rn) STMT_START { \
315 while (JUMPABLE(rn)) { \
316 const OPCODE type = OP(rn); \
317 if (type == SUSPEND || PL_regkind[type] == CURLY) \
318 rn = NEXTOPER(NEXTOPER(rn)); \
319 else if (type == PLUS) \
321 else if (type == IFMATCH) \
322 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
323 else rn += NEXT_OFF(rn); \
328 static void restore_pos(pTHX_ void *arg);
330 #define REGCP_PAREN_ELEMS 3
331 #define REGCP_OTHER_ELEMS 3
332 #define REGCP_FRAME_ELEMS 1
333 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
334 * are needed for the regexp context stack bookkeeping. */
337 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
340 const int retval = PL_savestack_ix;
341 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
342 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
343 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
345 GET_RE_DEBUG_FLAGS_DECL;
347 PERL_ARGS_ASSERT_REGCPPUSH;
349 if (paren_elems_to_push < 0)
350 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
351 paren_elems_to_push);
353 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
354 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
355 " out of range (%lu-%ld)",
356 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
358 SSGROW(total_elems + REGCP_FRAME_ELEMS);
361 if ((int)PL_regsize > (int)parenfloor)
362 PerlIO_printf(Perl_debug_log,
363 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
368 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
369 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
370 SSPUSHINT(rex->offs[p].end);
371 SSPUSHINT(rex->offs[p].start);
372 SSPUSHINT(rex->offs[p].start_tmp);
373 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
374 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
376 (IV)rex->offs[p].start,
377 (IV)rex->offs[p].start_tmp,
381 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
382 SSPUSHINT(PL_regsize);
383 SSPUSHINT(rex->lastparen);
384 SSPUSHINT(rex->lastcloseparen);
385 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
390 /* These are needed since we do not localize EVAL nodes: */
391 #define REGCP_SET(cp) \
393 PerlIO_printf(Perl_debug_log, \
394 " Setting an EVAL scope, savestack=%"IVdf"\n", \
395 (IV)PL_savestack_ix)); \
398 #define REGCP_UNWIND(cp) \
400 if (cp != PL_savestack_ix) \
401 PerlIO_printf(Perl_debug_log, \
402 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
403 (IV)(cp), (IV)PL_savestack_ix)); \
406 #define UNWIND_PAREN(lp, lcp) \
407 for (n = rex->lastparen; n > lp; n--) \
408 rex->offs[n].end = -1; \
409 rex->lastparen = n; \
410 rex->lastcloseparen = lcp;
414 S_regcppop(pTHX_ regexp *rex)
419 GET_RE_DEBUG_FLAGS_DECL;
421 PERL_ARGS_ASSERT_REGCPPOP;
423 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
425 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
426 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
427 rex->lastcloseparen = SSPOPINT;
428 rex->lastparen = SSPOPINT;
429 PL_regsize = SSPOPINT;
431 i -= REGCP_OTHER_ELEMS;
432 /* Now restore the parentheses context. */
434 if (i || rex->lastparen + 1 <= rex->nparens)
435 PerlIO_printf(Perl_debug_log,
436 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
442 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
444 rex->offs[paren].start_tmp = SSPOPINT;
445 rex->offs[paren].start = SSPOPINT;
447 if (paren <= rex->lastparen)
448 rex->offs[paren].end = tmps;
449 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
450 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
452 (IV)rex->offs[paren].start,
453 (IV)rex->offs[paren].start_tmp,
454 (IV)rex->offs[paren].end,
455 (paren > rex->lastparen ? "(skipped)" : ""));
460 /* It would seem that the similar code in regtry()
461 * already takes care of this, and in fact it is in
462 * a better location to since this code can #if 0-ed out
463 * but the code in regtry() is needed or otherwise tests
464 * requiring null fields (pat.t#187 and split.t#{13,14}
465 * (as of patchlevel 7877) will fail. Then again,
466 * this code seems to be necessary or otherwise
467 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
468 * --jhi updated by dapm */
469 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
471 rex->offs[i].start = -1;
472 rex->offs[i].end = -1;
473 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
474 " \\%"UVuf": %s ..-1 undeffing\n",
476 (i > PL_regsize) ? "-1" : " "
482 /* restore the parens and associated vars at savestack position ix,
483 * but without popping the stack */
486 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
488 I32 tmpix = PL_savestack_ix;
489 PL_savestack_ix = ix;
491 PL_savestack_ix = tmpix;
494 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
497 * pregexec and friends
500 #ifndef PERL_IN_XSUB_RE
502 - pregexec - match a regexp against a string
505 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
506 char *strbeg, I32 minend, SV *screamer, U32 nosave)
507 /* stringarg: the point in the string at which to begin matching */
508 /* strend: pointer to null at end of string */
509 /* strbeg: real beginning of string */
510 /* minend: end of match must be >= minend bytes after stringarg. */
511 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
512 * itself is accessed via the pointers above */
513 /* nosave: For optimizations. */
515 PERL_ARGS_ASSERT_PREGEXEC;
518 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
519 nosave ? 0 : REXEC_COPY_STR);
524 * Need to implement the following flags for reg_anch:
526 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
528 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
529 * INTUIT_AUTORITATIVE_ML
530 * INTUIT_ONCE_NOML - Intuit can match in one location only.
533 * Another flag for this function: SECOND_TIME (so that float substrs
534 * with giant delta may be not rechecked).
537 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
539 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
540 Otherwise, only SvCUR(sv) is used to get strbeg. */
542 /* XXXX We assume that strpos is strbeg unless sv. */
544 /* XXXX Some places assume that there is a fixed substring.
545 An update may be needed if optimizer marks as "INTUITable"
546 RExen without fixed substrings. Similarly, it is assumed that
547 lengths of all the strings are no more than minlen, thus they
548 cannot come from lookahead.
549 (Or minlen should take into account lookahead.)
550 NOTE: Some of this comment is not correct. minlen does now take account
551 of lookahead/behind. Further research is required. -- demerphq
555 /* A failure to find a constant substring means that there is no need to make
556 an expensive call to REx engine, thus we celebrate a failure. Similarly,
557 finding a substring too deep into the string means that less calls to
558 regtry() should be needed.
560 REx compiler's optimizer found 4 possible hints:
561 a) Anchored substring;
563 c) Whether we are anchored (beginning-of-line or \G);
564 d) First node (of those at offset 0) which may distinguish positions;
565 We use a)b)d) and multiline-part of c), and try to find a position in the
566 string which does not contradict any of them.
569 /* Most of decisions we do here should have been done at compile time.
570 The nodes of the REx which we used for the search should have been
571 deleted from the finite automaton. */
574 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
575 char *strend, const U32 flags, re_scream_pos_data *data)
578 struct regexp *const prog = (struct regexp *)SvANY(rx);
580 /* Should be nonnegative! */
586 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
588 char *other_last = NULL; /* other substr checked before this */
589 char *check_at = NULL; /* check substr found at this pos */
590 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
591 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
592 RXi_GET_DECL(prog,progi);
594 const char * const i_strpos = strpos;
596 GET_RE_DEBUG_FLAGS_DECL;
598 PERL_ARGS_ASSERT_RE_INTUIT_START;
599 PERL_UNUSED_ARG(flags);
600 PERL_UNUSED_ARG(data);
602 RX_MATCH_UTF8_set(rx,utf8_target);
605 PL_reg_flags |= RF_utf8;
608 debug_start_match(rx, utf8_target, strpos, strend,
609 sv ? "Guessing start of match in sv for"
610 : "Guessing start of match in string for");
613 /* CHR_DIST() would be more correct here but it makes things slow. */
614 if (prog->minlen > strend - strpos) {
615 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
616 "String too short... [re_intuit_start]\n"));
620 /* XXX we need to pass strbeg as a separate arg: the following is
621 * guesswork and can be wrong... */
622 if (sv && SvPOK(sv)) {
623 char * p = SvPVX(sv);
624 STRLEN cur = SvCUR(sv);
625 if (p <= strpos && strpos < p + cur) {
627 assert(p <= strend && strend <= p + cur);
630 strbeg = strend - cur;
637 if (!prog->check_utf8 && prog->check_substr)
638 to_utf8_substr(prog);
639 check = prog->check_utf8;
641 if (!prog->check_substr && prog->check_utf8) {
642 if (! to_byte_substr(prog)) {
643 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
644 non_utf8_target_but_utf8_required));
648 check = prog->check_substr;
650 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
651 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
652 || ( (prog->extflags & RXf_ANCH_BOL)
653 && !multiline ) ); /* Check after \n? */
656 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
657 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
658 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
660 && (strpos != strbeg)) {
661 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
664 if (prog->check_offset_min == prog->check_offset_max &&
665 !(prog->extflags & RXf_CANY_SEEN)) {
666 /* Substring at constant offset from beg-of-str... */
669 s = HOP3c(strpos, prog->check_offset_min, strend);
672 slen = SvCUR(check); /* >= 1 */
674 if ( strend - s > slen || strend - s < slen - 1
675 || (strend - s == slen && strend[-1] != '\n')) {
676 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
679 /* Now should match s[0..slen-2] */
681 if (slen && (*SvPVX_const(check) != *s
683 && memNE(SvPVX_const(check), s, slen)))) {
685 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
689 else if (*SvPVX_const(check) != *s
690 || ((slen = SvCUR(check)) > 1
691 && memNE(SvPVX_const(check), s, slen)))
694 goto success_at_start;
697 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
699 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
700 end_shift = prog->check_end_shift;
703 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
704 - (SvTAIL(check) != 0);
705 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
707 if (end_shift < eshift)
711 else { /* Can match at random position */
714 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
715 end_shift = prog->check_end_shift;
717 /* end shift should be non negative here */
720 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
722 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
723 (IV)end_shift, RX_PRECOMP(prog));
727 /* Find a possible match in the region s..strend by looking for
728 the "check" substring in the region corrected by start/end_shift. */
731 I32 srch_start_shift = start_shift;
732 I32 srch_end_shift = end_shift;
735 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
736 srch_end_shift -= ((strbeg - s) - srch_start_shift);
737 srch_start_shift = strbeg - s;
739 DEBUG_OPTIMISE_MORE_r({
740 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
741 (IV)prog->check_offset_min,
742 (IV)srch_start_shift,
744 (IV)prog->check_end_shift);
747 if (prog->extflags & RXf_CANY_SEEN) {
748 start_point= (U8*)(s + srch_start_shift);
749 end_point= (U8*)(strend - srch_end_shift);
751 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
752 end_point= HOP3(strend, -srch_end_shift, strbeg);
754 DEBUG_OPTIMISE_MORE_r({
755 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
756 (int)(end_point - start_point),
757 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
761 s = fbm_instr( start_point, end_point,
762 check, multiline ? FBMrf_MULTILINE : 0);
764 /* Update the count-of-usability, remove useless subpatterns,
768 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
769 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
770 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
771 (s ? "Found" : "Did not find"),
772 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
773 ? "anchored" : "floating"),
776 (s ? " at offset " : "...\n") );
781 /* Finish the diagnostic message */
782 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
784 /* XXX dmq: first branch is for positive lookbehind...
785 Our check string is offset from the beginning of the pattern.
786 So we need to do any stclass tests offset forward from that
795 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
796 Start with the other substr.
797 XXXX no SCREAM optimization yet - and a very coarse implementation
798 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
799 *always* match. Probably should be marked during compile...
800 Probably it is right to do no SCREAM here...
803 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
804 : (prog->float_substr && prog->anchored_substr))
806 /* Take into account the "other" substring. */
807 /* XXXX May be hopelessly wrong for UTF... */
810 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
813 char * const last = HOP3c(s, -start_shift, strbeg);
815 char * const saved_s = s;
818 t = s - prog->check_offset_max;
819 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
821 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
826 t = HOP3c(t, prog->anchored_offset, strend);
827 if (t < other_last) /* These positions already checked */
829 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
832 /* XXXX It is not documented what units *_offsets are in.
833 We assume bytes, but this is clearly wrong.
834 Meaning this code needs to be carefully reviewed for errors.
838 /* On end-of-str: see comment below. */
839 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
840 if (must == &PL_sv_undef) {
842 DEBUG_r(must = prog->anchored_utf8); /* for debug */
847 HOP3(HOP3(last1, prog->anchored_offset, strend)
848 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
850 multiline ? FBMrf_MULTILINE : 0
853 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
854 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
855 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
856 (s ? "Found" : "Contradicts"),
857 quoted, RE_SV_TAIL(must));
862 if (last1 >= last2) {
863 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
864 ", giving up...\n"));
867 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
868 ", trying floating at offset %ld...\n",
869 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
870 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
871 s = HOP3c(last, 1, strend);
875 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
876 (long)(s - i_strpos)));
877 t = HOP3c(s, -prog->anchored_offset, strbeg);
878 other_last = HOP3c(s, 1, strend);
886 else { /* Take into account the floating substring. */
888 char * const saved_s = s;
891 t = HOP3c(s, -start_shift, strbeg);
893 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
894 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
895 last = HOP3c(t, prog->float_max_offset, strend);
896 s = HOP3c(t, prog->float_min_offset, strend);
899 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
900 must = utf8_target ? prog->float_utf8 : prog->float_substr;
901 /* fbm_instr() takes into account exact value of end-of-str
902 if the check is SvTAIL(ed). Since false positives are OK,
903 and end-of-str is not later than strend we are OK. */
904 if (must == &PL_sv_undef) {
906 DEBUG_r(must = prog->float_utf8); /* for debug message */
909 s = fbm_instr((unsigned char*)s,
910 (unsigned char*)last + SvCUR(must)
912 must, multiline ? FBMrf_MULTILINE : 0);
914 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
915 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
916 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
917 (s ? "Found" : "Contradicts"),
918 quoted, RE_SV_TAIL(must));
922 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
923 ", giving up...\n"));
926 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
927 ", trying anchored starting at offset %ld...\n",
928 (long)(saved_s + 1 - i_strpos)));
930 s = HOP3c(t, 1, strend);
934 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
935 (long)(s - i_strpos)));
936 other_last = s; /* Fix this later. --Hugo */
946 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
948 DEBUG_OPTIMISE_MORE_r(
949 PerlIO_printf(Perl_debug_log,
950 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
951 (IV)prog->check_offset_min,
952 (IV)prog->check_offset_max,
960 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
962 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
965 /* Fixed substring is found far enough so that the match
966 cannot start at strpos. */
968 if (ml_anch && t[-1] != '\n') {
969 /* Eventually fbm_*() should handle this, but often
970 anchored_offset is not 0, so this check will not be wasted. */
971 /* XXXX In the code below we prefer to look for "^" even in
972 presence of anchored substrings. And we search even
973 beyond the found float position. These pessimizations
974 are historical artefacts only. */
976 while (t < strend - prog->minlen) {
978 if (t < check_at - prog->check_offset_min) {
979 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
980 /* Since we moved from the found position,
981 we definitely contradict the found anchored
982 substr. Due to the above check we do not
983 contradict "check" substr.
984 Thus we can arrive here only if check substr
985 is float. Redo checking for "other"=="fixed".
988 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
989 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
990 goto do_other_anchored;
992 /* We don't contradict the found floating substring. */
993 /* XXXX Why not check for STCLASS? */
995 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
996 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
999 /* Position contradicts check-string */
1000 /* XXXX probably better to look for check-string
1001 than for "\n", so one should lower the limit for t? */
1002 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1003 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
1004 other_last = strpos = s = t + 1;
1009 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
1010 PL_colors[0], PL_colors[1]));
1014 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
1015 PL_colors[0], PL_colors[1]));
1019 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1022 /* The found string does not prohibit matching at strpos,
1023 - no optimization of calling REx engine can be performed,
1024 unless it was an MBOL and we are not after MBOL,
1025 or a future STCLASS check will fail this. */
1027 /* Even in this situation we may use MBOL flag if strpos is offset
1028 wrt the start of the string. */
1029 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1030 && (strpos != strbeg) && strpos[-1] != '\n'
1031 /* May be due to an implicit anchor of m{.*foo} */
1032 && !(prog->intflags & PREGf_IMPLICIT))
1037 DEBUG_EXECUTE_r( if (ml_anch)
1038 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1039 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1042 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1044 prog->check_utf8 /* Could be deleted already */
1045 && --BmUSEFUL(prog->check_utf8) < 0
1046 && (prog->check_utf8 == prog->float_utf8)
1048 prog->check_substr /* Could be deleted already */
1049 && --BmUSEFUL(prog->check_substr) < 0
1050 && (prog->check_substr == prog->float_substr)
1053 /* If flags & SOMETHING - do not do it many times on the same match */
1054 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1055 /* XXX Does the destruction order has to change with utf8_target? */
1056 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1057 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1058 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1059 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1060 check = NULL; /* abort */
1062 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1063 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1064 if (prog->intflags & PREGf_IMPLICIT)
1065 prog->extflags &= ~RXf_ANCH_MBOL;
1066 /* XXXX This is a remnant of the old implementation. It
1067 looks wasteful, since now INTUIT can use many
1068 other heuristics. */
1069 prog->extflags &= ~RXf_USE_INTUIT;
1070 /* XXXX What other flags might need to be cleared in this branch? */
1076 /* Last resort... */
1077 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1078 /* trie stclasses are too expensive to use here, we are better off to
1079 leave it to regmatch itself */
1080 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1081 /* minlen == 0 is possible if regstclass is \b or \B,
1082 and the fixed substr is ''$.
1083 Since minlen is already taken into account, s+1 is before strend;
1084 accidentally, minlen >= 1 guaranties no false positives at s + 1
1085 even for \b or \B. But (minlen? 1 : 0) below assumes that
1086 regstclass does not come from lookahead... */
1087 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1088 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1089 const U8* const str = (U8*)STRING(progi->regstclass);
1090 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1091 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1094 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1095 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1096 else if (prog->float_substr || prog->float_utf8)
1097 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1101 if (checked_upto < s)
1103 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1104 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1107 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1112 const char *what = NULL;
1114 if (endpos == strend) {
1115 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1116 "Could not match STCLASS...\n") );
1119 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1120 "This position contradicts STCLASS...\n") );
1121 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1123 checked_upto = HOPBACKc(endpos, start_shift);
1124 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1125 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1126 /* Contradict one of substrings */
1127 if (prog->anchored_substr || prog->anchored_utf8) {
1128 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1129 DEBUG_EXECUTE_r( what = "anchored" );
1131 s = HOP3c(t, 1, strend);
1132 if (s + start_shift + end_shift > strend) {
1133 /* XXXX Should be taken into account earlier? */
1134 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1135 "Could not match STCLASS...\n") );
1140 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1141 "Looking for %s substr starting at offset %ld...\n",
1142 what, (long)(s + start_shift - i_strpos)) );
1145 /* Have both, check_string is floating */
1146 if (t + start_shift >= check_at) /* Contradicts floating=check */
1147 goto retry_floating_check;
1148 /* Recheck anchored substring, but not floating... */
1152 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1153 "Looking for anchored substr starting at offset %ld...\n",
1154 (long)(other_last - i_strpos)) );
1155 goto do_other_anchored;
1157 /* Another way we could have checked stclass at the
1158 current position only: */
1163 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1164 "Looking for /%s^%s/m starting at offset %ld...\n",
1165 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1168 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1170 /* Check is floating substring. */
1171 retry_floating_check:
1172 t = check_at - start_shift;
1173 DEBUG_EXECUTE_r( what = "floating" );
1174 goto hop_and_restart;
1177 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1178 "By STCLASS: moving %ld --> %ld\n",
1179 (long)(t - i_strpos), (long)(s - i_strpos))
1183 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1184 "Does not contradict STCLASS...\n");
1189 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1190 PL_colors[4], (check ? "Guessed" : "Giving up"),
1191 PL_colors[5], (long)(s - i_strpos)) );
1194 fail_finish: /* Substring not found */
1195 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1196 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1198 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1199 PL_colors[4], PL_colors[5]));
1203 #define DECL_TRIE_TYPE(scan) \
1204 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1205 trie_type = ((scan->flags == EXACT) \
1206 ? (utf8_target ? trie_utf8 : trie_plain) \
1207 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1209 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1210 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1212 switch (trie_type) { \
1213 case trie_utf8_fold: \
1214 if ( foldlen>0 ) { \
1215 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1220 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1221 len = UTF8SKIP(uc); \
1222 skiplen = UNISKIP( uvc ); \
1223 foldlen -= skiplen; \
1224 uscan = foldbuf + skiplen; \
1227 case trie_latin_utf8_fold: \
1228 if ( foldlen>0 ) { \
1229 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1235 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1236 skiplen = UNISKIP( uvc ); \
1237 foldlen -= skiplen; \
1238 uscan = foldbuf + skiplen; \
1242 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1249 charid = trie->charmap[ uvc ]; \
1253 if (widecharmap) { \
1254 SV** const svpp = hv_fetch(widecharmap, \
1255 (char*)&uvc, sizeof(UV), 0); \
1257 charid = (U16)SvIV(*svpp); \
1262 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1266 && (ln == 1 || folder(s, pat_string, ln)) \
1267 && (!reginfo || regtry(reginfo, &s)) ) \
1273 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1275 while (s < strend && s + (uskip = UTF8SKIP(s)) <= strend) { \
1281 #define REXEC_FBC_SCAN(CoDe) \
1283 while (s < strend) { \
1289 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1290 REXEC_FBC_UTF8_SCAN( \
1292 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1301 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1304 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1313 #define REXEC_FBC_TRYIT \
1314 if ((!reginfo || regtry(reginfo, &s))) \
1317 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1318 if (utf8_target) { \
1319 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1322 REXEC_FBC_CLASS_SCAN(CoNd); \
1325 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1326 if (utf8_target) { \
1328 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1331 REXEC_FBC_CLASS_SCAN(CoNd); \
1334 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1335 PL_reg_flags |= RF_tainted; \
1336 if (utf8_target) { \
1337 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1340 REXEC_FBC_CLASS_SCAN(CoNd); \
1343 #define DUMP_EXEC_POS(li,s,doutf8) \
1344 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1347 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1348 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1349 tmp = TEST_NON_UTF8(tmp); \
1350 REXEC_FBC_UTF8_SCAN( \
1351 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1360 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1361 if (s == PL_bostr) { \
1365 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1366 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1369 LOAD_UTF8_CHARCLASS_ALNUM(); \
1370 REXEC_FBC_UTF8_SCAN( \
1371 if (tmp == ! (TeSt2_UtF8)) { \
1380 /* The only difference between the BOUND and NBOUND cases is that
1381 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1382 * NBOUND. This is accomplished by passing it in either the if or else clause,
1383 * with the other one being empty */
1384 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1385 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1387 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1388 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1390 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1391 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1393 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1394 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1397 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1398 * be passed in completely with the variable name being tested, which isn't
1399 * such a clean interface, but this is easier to read than it was before. We
1400 * are looking for the boundary (or non-boundary between a word and non-word
1401 * character. The utf8 and non-utf8 cases have the same logic, but the details
1402 * must be different. Find the "wordness" of the character just prior to this
1403 * one, and compare it with the wordness of this one. If they differ, we have
1404 * a boundary. At the beginning of the string, pretend that the previous
1405 * character was a new-line */
1406 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1407 if (utf8_target) { \
1410 else { /* Not utf8 */ \
1411 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1412 tmp = TEST_NON_UTF8(tmp); \
1414 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1423 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1426 /* We know what class REx starts with. Try to find this position... */
1427 /* if reginfo is NULL, its a dryrun */
1428 /* annoyingly all the vars in this routine have different names from their counterparts
1429 in regmatch. /grrr */
1432 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1433 const char *strend, regmatch_info *reginfo)
1436 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1437 char *pat_string; /* The pattern's exactish string */
1438 char *pat_end; /* ptr to end char of pat_string */
1439 re_fold_t folder; /* Function for computing non-utf8 folds */
1440 const U8 *fold_array; /* array for folding ords < 256 */
1447 I32 tmp = 1; /* Scratch variable? */
1448 const bool utf8_target = PL_reg_match_utf8;
1449 UV utf8_fold_flags = 0;
1450 RXi_GET_DECL(prog,progi);
1452 PERL_ARGS_ASSERT_FIND_BYCLASS;
1454 /* We know what class it must start with. */
1458 STRLEN inclasslen = strend - s;
1459 REXEC_FBC_UTF8_CLASS_SCAN(
1460 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1463 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1468 if (tmp && (!reginfo || regtry(reginfo, &s)))
1476 if (UTF_PATTERN || utf8_target) {
1477 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1478 goto do_exactf_utf8;
1480 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1481 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1482 goto do_exactf_non_utf8; /* isn't dealt with by these */
1487 /* regcomp.c already folded this if pattern is in UTF-8 */
1488 utf8_fold_flags = 0;
1489 goto do_exactf_utf8;
1491 fold_array = PL_fold;
1493 goto do_exactf_non_utf8;
1496 if (UTF_PATTERN || utf8_target) {
1497 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1498 goto do_exactf_utf8;
1500 fold_array = PL_fold_locale;
1501 folder = foldEQ_locale;
1502 goto do_exactf_non_utf8;
1506 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1508 goto do_exactf_utf8;
1510 case EXACTFU_TRICKYFOLD:
1512 if (UTF_PATTERN || utf8_target) {
1513 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1514 goto do_exactf_utf8;
1517 /* Any 'ss' in the pattern should have been replaced by regcomp,
1518 * so we don't have to worry here about this single special case
1519 * in the Latin1 range */
1520 fold_array = PL_fold_latin1;
1521 folder = foldEQ_latin1;
1525 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1526 are no glitches with fold-length differences
1527 between the target string and pattern */
1529 /* The idea in the non-utf8 EXACTF* cases is to first find the
1530 * first character of the EXACTF* node and then, if necessary,
1531 * case-insensitively compare the full text of the node. c1 is the
1532 * first character. c2 is its fold. This logic will not work for
1533 * Unicode semantics and the german sharp ss, which hence should
1534 * not be compiled into a node that gets here. */
1535 pat_string = STRING(c);
1536 ln = STR_LEN(c); /* length to match in octets/bytes */
1538 /* We know that we have to match at least 'ln' bytes (which is the
1539 * same as characters, since not utf8). If we have to match 3
1540 * characters, and there are only 2 availabe, we know without
1541 * trying that it will fail; so don't start a match past the
1542 * required minimum number from the far end */
1543 e = HOP3c(strend, -((I32)ln), s);
1545 if (!reginfo && e < s) {
1546 e = s; /* Due to minlen logic of intuit() */
1550 c2 = fold_array[c1];
1551 if (c1 == c2) { /* If char and fold are the same */
1552 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1555 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1564 /* If one of the operands is in utf8, we can't use the simpler
1565 * folding above, due to the fact that many different characters
1566 * can have the same fold, or portion of a fold, or different-
1568 pat_string = STRING(c);
1569 ln = STR_LEN(c); /* length to match in octets/bytes */
1570 pat_end = pat_string + ln;
1571 lnc = (UTF_PATTERN) /* length to match in characters */
1572 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1575 /* We have 'lnc' characters to match in the pattern, but because of
1576 * multi-character folding, each character in the target can match
1577 * up to 3 characters (Unicode guarantees it will never exceed
1578 * this) if it is utf8-encoded; and up to 2 if not (based on the
1579 * fact that the Latin 1 folds are already determined, and the
1580 * only multi-char fold in that range is the sharp-s folding to
1581 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1582 * string character. Adjust lnc accordingly, rounding up, so that
1583 * if we need to match at least 4+1/3 chars, that really is 5. */
1584 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1585 lnc = (lnc + expansion - 1) / expansion;
1587 /* As in the non-UTF8 case, if we have to match 3 characters, and
1588 * only 2 are left, it's guaranteed to fail, so don't start a
1589 * match that would require us to go beyond the end of the string
1591 e = HOP3c(strend, -((I32)lnc), s);
1593 if (!reginfo && e < s) {
1594 e = s; /* Due to minlen logic of intuit() */
1597 /* XXX Note that we could recalculate e to stop the loop earlier,
1598 * as the worst case expansion above will rarely be met, and as we
1599 * go along we would usually find that e moves further to the left.
1600 * This would happen only after we reached the point in the loop
1601 * where if there were no expansion we should fail. Unclear if
1602 * worth the expense */
1605 char *my_strend= (char *)strend;
1606 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1607 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1608 && (!reginfo || regtry(reginfo, &s)) )
1612 s += (utf8_target) ? UTF8SKIP(s) : 1;
1617 PL_reg_flags |= RF_tainted;
1618 FBC_BOUND(isALNUM_LC,
1619 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1620 isALNUM_LC_utf8((U8*)s));
1623 PL_reg_flags |= RF_tainted;
1624 FBC_NBOUND(isALNUM_LC,
1625 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1626 isALNUM_LC_utf8((U8*)s));
1629 FBC_BOUND(isWORDCHAR,
1631 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1634 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1636 isWORDCHAR_A((U8*)s));
1639 FBC_NBOUND(isWORDCHAR,
1641 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1644 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1646 isWORDCHAR_A((U8*)s));
1649 FBC_BOUND(isWORDCHAR_L1,
1651 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1654 FBC_NBOUND(isWORDCHAR_L1,
1656 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1659 REXEC_FBC_CSCAN_TAINT(
1660 isALNUM_LC_utf8((U8*)s),
1665 REXEC_FBC_CSCAN_PRELOAD(
1666 LOAD_UTF8_CHARCLASS_ALNUM(),
1667 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1668 isWORDCHAR_L1((U8) *s)
1672 REXEC_FBC_CSCAN_PRELOAD(
1673 LOAD_UTF8_CHARCLASS_ALNUM(),
1674 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1679 /* Don't need to worry about utf8, as it can match only a single
1680 * byte invariant character */
1681 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1684 REXEC_FBC_CSCAN_PRELOAD(
1685 LOAD_UTF8_CHARCLASS_ALNUM(),
1686 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1687 ! isWORDCHAR_L1((U8) *s)
1691 REXEC_FBC_CSCAN_PRELOAD(
1692 LOAD_UTF8_CHARCLASS_ALNUM(),
1693 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1704 REXEC_FBC_CSCAN_TAINT(
1705 !isALNUM_LC_utf8((U8*)s),
1710 REXEC_FBC_CSCAN_PRELOAD(
1711 LOAD_UTF8_CHARCLASS_SPACE(),
1712 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1717 REXEC_FBC_CSCAN_PRELOAD(
1718 LOAD_UTF8_CHARCLASS_SPACE(),
1719 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1724 /* Don't need to worry about utf8, as it can match only a single
1725 * byte invariant character */
1726 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1729 REXEC_FBC_CSCAN_TAINT(
1730 isSPACE_LC_utf8((U8*)s),
1735 REXEC_FBC_CSCAN_PRELOAD(
1736 LOAD_UTF8_CHARCLASS_SPACE(),
1737 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1738 ! isSPACE_L1((U8) *s)
1742 REXEC_FBC_CSCAN_PRELOAD(
1743 LOAD_UTF8_CHARCLASS_SPACE(),
1744 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1755 REXEC_FBC_CSCAN_TAINT(
1756 !isSPACE_LC_utf8((U8*)s),
1761 REXEC_FBC_CSCAN_PRELOAD(
1762 LOAD_UTF8_CHARCLASS_DIGIT(),
1763 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1768 /* Don't need to worry about utf8, as it can match only a single
1769 * byte invariant character */
1770 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1773 REXEC_FBC_CSCAN_TAINT(
1774 isDIGIT_LC_utf8((U8*)s),
1779 REXEC_FBC_CSCAN_PRELOAD(
1780 LOAD_UTF8_CHARCLASS_DIGIT(),
1781 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1792 REXEC_FBC_CSCAN_TAINT(
1793 !isDIGIT_LC_utf8((U8*)s),
1799 is_LNBREAK_utf8_safe(s, strend),
1800 is_LNBREAK_latin1_safe(s, strend)
1805 is_VERTWS_utf8_safe(s, strend),
1806 is_VERTWS_latin1_safe(s, strend)
1811 !is_VERTWS_utf8_safe(s, strend),
1812 !is_VERTWS_latin1_safe(s, strend)
1817 is_HORIZWS_utf8_safe(s, strend),
1818 is_HORIZWS_latin1_safe(s, strend)
1823 !is_HORIZWS_utf8_safe(s, strend),
1824 !is_HORIZWS_latin1_safe(s, strend)
1828 /* Don't need to worry about utf8, as it can match only a single
1829 * byte invariant character. The flag in this node type is the
1830 * class number to pass to _generic_isCC() to build a mask for
1831 * searching in PL_charclass[] */
1832 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1836 !_generic_isCC_A(*s, FLAGS(c)),
1837 !_generic_isCC_A(*s, FLAGS(c))
1845 /* what trie are we using right now */
1847 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1849 = (reg_trie_data*)progi->data->data[ aho->trie ];
1850 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1852 const char *last_start = strend - trie->minlen;
1854 const char *real_start = s;
1856 STRLEN maxlen = trie->maxlen;
1858 U8 **points; /* map of where we were in the input string
1859 when reading a given char. For ASCII this
1860 is unnecessary overhead as the relationship
1861 is always 1:1, but for Unicode, especially
1862 case folded Unicode this is not true. */
1863 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1867 GET_RE_DEBUG_FLAGS_DECL;
1869 /* We can't just allocate points here. We need to wrap it in
1870 * an SV so it gets freed properly if there is a croak while
1871 * running the match */
1874 sv_points=newSV(maxlen * sizeof(U8 *));
1875 SvCUR_set(sv_points,
1876 maxlen * sizeof(U8 *));
1877 SvPOK_on(sv_points);
1878 sv_2mortal(sv_points);
1879 points=(U8**)SvPV_nolen(sv_points );
1880 if ( trie_type != trie_utf8_fold
1881 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1884 bitmap=(U8*)trie->bitmap;
1886 bitmap=(U8*)ANYOF_BITMAP(c);
1888 /* this is the Aho-Corasick algorithm modified a touch
1889 to include special handling for long "unknown char"
1890 sequences. The basic idea being that we use AC as long
1891 as we are dealing with a possible matching char, when
1892 we encounter an unknown char (and we have not encountered
1893 an accepting state) we scan forward until we find a legal
1895 AC matching is basically that of trie matching, except
1896 that when we encounter a failing transition, we fall back
1897 to the current states "fail state", and try the current char
1898 again, a process we repeat until we reach the root state,
1899 state 1, or a legal transition. If we fail on the root state
1900 then we can either terminate if we have reached an accepting
1901 state previously, or restart the entire process from the beginning
1905 while (s <= last_start) {
1906 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1914 U8 *uscan = (U8*)NULL;
1915 U8 *leftmost = NULL;
1917 U32 accepted_word= 0;
1921 while ( state && uc <= (U8*)strend ) {
1923 U32 word = aho->states[ state ].wordnum;
1927 DEBUG_TRIE_EXECUTE_r(
1928 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1929 dump_exec_pos( (char *)uc, c, strend, real_start,
1930 (char *)uc, utf8_target );
1931 PerlIO_printf( Perl_debug_log,
1932 " Scanning for legal start char...\n");
1936 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1940 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1946 if (uc >(U8*)last_start) break;
1950 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1951 if (!leftmost || lpos < leftmost) {
1952 DEBUG_r(accepted_word=word);
1958 points[pointpos++ % maxlen]= uc;
1959 if (foldlen || uc < (U8*)strend) {
1960 REXEC_TRIE_READ_CHAR(trie_type, trie,
1962 uscan, len, uvc, charid, foldlen,
1964 DEBUG_TRIE_EXECUTE_r({
1965 dump_exec_pos( (char *)uc, c, strend,
1966 real_start, s, utf8_target);
1967 PerlIO_printf(Perl_debug_log,
1968 " Charid:%3u CP:%4"UVxf" ",
1980 word = aho->states[ state ].wordnum;
1982 base = aho->states[ state ].trans.base;
1984 DEBUG_TRIE_EXECUTE_r({
1986 dump_exec_pos( (char *)uc, c, strend, real_start,
1988 PerlIO_printf( Perl_debug_log,
1989 "%sState: %4"UVxf", word=%"UVxf,
1990 failed ? " Fail transition to " : "",
1991 (UV)state, (UV)word);
1997 ( ((offset = base + charid
1998 - 1 - trie->uniquecharcount)) >= 0)
1999 && ((U32)offset < trie->lasttrans)
2000 && trie->trans[offset].check == state
2001 && (tmp=trie->trans[offset].next))
2003 DEBUG_TRIE_EXECUTE_r(
2004 PerlIO_printf( Perl_debug_log," - legal\n"));
2009 DEBUG_TRIE_EXECUTE_r(
2010 PerlIO_printf( Perl_debug_log," - fail\n"));
2012 state = aho->fail[state];
2016 /* we must be accepting here */
2017 DEBUG_TRIE_EXECUTE_r(
2018 PerlIO_printf( Perl_debug_log," - accepting\n"));
2027 if (!state) state = 1;
2030 if ( aho->states[ state ].wordnum ) {
2031 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2032 if (!leftmost || lpos < leftmost) {
2033 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2038 s = (char*)leftmost;
2039 DEBUG_TRIE_EXECUTE_r({
2041 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2042 (UV)accepted_word, (IV)(s - real_start)
2045 if (!reginfo || regtry(reginfo, &s)) {
2051 DEBUG_TRIE_EXECUTE_r({
2052 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2055 DEBUG_TRIE_EXECUTE_r(
2056 PerlIO_printf( Perl_debug_log,"No match.\n"));
2065 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2075 - regexec_flags - match a regexp against a string
2078 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2079 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2080 /* stringarg: the point in the string at which to begin matching */
2081 /* strend: pointer to null at end of string */
2082 /* strbeg: real beginning of string */
2083 /* minend: end of match must be >= minend bytes after stringarg. */
2084 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2085 * itself is accessed via the pointers above */
2086 /* data: May be used for some additional optimizations.
2087 Currently its only used, with a U32 cast, for transmitting
2088 the ganch offset when doing a /g match. This will change */
2089 /* nosave: For optimizations. */
2093 struct regexp *const prog = (struct regexp *)SvANY(rx);
2094 /*register*/ char *s;
2096 /*register*/ char *startpos = stringarg;
2097 I32 minlen; /* must match at least this many chars */
2098 I32 dontbother = 0; /* how many characters not to try at end */
2099 I32 end_shift = 0; /* Same for the end. */ /* CC */
2100 I32 scream_pos = -1; /* Internal iterator of scream. */
2101 char *scream_olds = NULL;
2102 const bool utf8_target = cBOOL(DO_UTF8(sv));
2104 RXi_GET_DECL(prog,progi);
2105 regmatch_info reginfo; /* create some info to pass to regtry etc */
2106 regexp_paren_pair *swap = NULL;
2107 GET_RE_DEBUG_FLAGS_DECL;
2109 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2110 PERL_UNUSED_ARG(data);
2112 /* Be paranoid... */
2113 if (prog == NULL || startpos == NULL) {
2114 Perl_croak(aTHX_ "NULL regexp parameter");
2118 multiline = prog->extflags & RXf_PMf_MULTILINE;
2119 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2121 RX_MATCH_UTF8_set(rx, utf8_target);
2123 debug_start_match(rx, utf8_target, startpos, strend,
2127 minlen = prog->minlen;
2129 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2130 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2131 "String too short [regexec_flags]...\n"));
2136 /* Check validity of program. */
2137 if (UCHARAT(progi->program) != REG_MAGIC) {
2138 Perl_croak(aTHX_ "corrupted regexp program");
2142 PL_reg_state.re_state_eval_setup_done = FALSE;
2146 PL_reg_flags |= RF_utf8;
2148 /* Mark beginning of line for ^ and lookbehind. */
2149 reginfo.bol = startpos; /* XXX not used ??? */
2153 /* Mark end of line for $ (and such) */
2156 /* see how far we have to get to not match where we matched before */
2157 reginfo.till = startpos+minend;
2159 /* If there is a "must appear" string, look for it. */
2162 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2164 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2165 reginfo.ganch = startpos + prog->gofs;
2166 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2167 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2168 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2170 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2171 && mg->mg_len >= 0) {
2172 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2173 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2174 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2176 if (prog->extflags & RXf_ANCH_GPOS) {
2177 if (s > reginfo.ganch)
2179 s = reginfo.ganch - prog->gofs;
2180 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2181 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2187 reginfo.ganch = strbeg + PTR2UV(data);
2188 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2189 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2191 } else { /* pos() not defined */
2192 reginfo.ganch = strbeg;
2193 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2194 "GPOS: reginfo.ganch = strbeg\n"));
2197 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2198 /* We have to be careful. If the previous successful match
2199 was from this regex we don't want a subsequent partially
2200 successful match to clobber the old results.
2201 So when we detect this possibility we add a swap buffer
2202 to the re, and switch the buffer each match. If we fail
2203 we switch it back, otherwise we leave it swapped.
2206 /* do we need a save destructor here for eval dies? */
2207 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2208 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2209 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2215 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2216 re_scream_pos_data d;
2218 d.scream_olds = &scream_olds;
2219 d.scream_pos = &scream_pos;
2220 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2222 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2223 goto phooey; /* not present */
2229 /* Simplest case: anchored match need be tried only once. */
2230 /* [unless only anchor is BOL and multiline is set] */
2231 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2232 if (s == startpos && regtry(®info, &startpos))
2234 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2235 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2240 dontbother = minlen - 1;
2241 end = HOP3c(strend, -dontbother, strbeg) - 1;
2242 /* for multiline we only have to try after newlines */
2243 if (prog->check_substr || prog->check_utf8) {
2244 /* because of the goto we can not easily reuse the macros for bifurcating the
2245 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2248 goto after_try_utf8;
2250 if (regtry(®info, &s)) {
2257 if (prog->extflags & RXf_USE_INTUIT) {
2258 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2267 } /* end search for check string in unicode */
2269 if (s == startpos) {
2270 goto after_try_latin;
2273 if (regtry(®info, &s)) {
2280 if (prog->extflags & RXf_USE_INTUIT) {
2281 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2290 } /* end search for check string in latin*/
2291 } /* end search for check string */
2292 else { /* search for newline */
2294 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2297 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2298 while (s <= end) { /* note it could be possible to match at the end of the string */
2299 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2300 if (regtry(®info, &s))
2304 } /* end search for newline */
2305 } /* end anchored/multiline check string search */
2307 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2309 /* the warning about reginfo.ganch being used without initialization
2310 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2311 and we only enter this block when the same bit is set. */
2312 char *tmp_s = reginfo.ganch - prog->gofs;
2314 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2319 /* Messy cases: unanchored match. */
2320 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2321 /* we have /x+whatever/ */
2322 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2328 if (! prog->anchored_utf8) {
2329 to_utf8_substr(prog);
2331 ch = SvPVX_const(prog->anchored_utf8)[0];
2334 DEBUG_EXECUTE_r( did_match = 1 );
2335 if (regtry(®info, &s)) goto got_it;
2337 while (s < strend && *s == ch)
2344 if (! prog->anchored_substr) {
2345 if (! to_byte_substr(prog)) {
2346 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2347 non_utf8_target_but_utf8_required));
2351 ch = SvPVX_const(prog->anchored_substr)[0];
2354 DEBUG_EXECUTE_r( did_match = 1 );
2355 if (regtry(®info, &s)) goto got_it;
2357 while (s < strend && *s == ch)
2362 DEBUG_EXECUTE_r(if (!did_match)
2363 PerlIO_printf(Perl_debug_log,
2364 "Did not find anchored character...\n")
2367 else if (prog->anchored_substr != NULL
2368 || prog->anchored_utf8 != NULL
2369 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2370 && prog->float_max_offset < strend - s)) {
2375 char *last1; /* Last position checked before */
2379 if (prog->anchored_substr || prog->anchored_utf8) {
2381 if (! prog->anchored_utf8) {
2382 to_utf8_substr(prog);
2384 must = prog->anchored_utf8;
2387 if (! prog->anchored_substr) {
2388 if (! to_byte_substr(prog)) {
2389 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2390 non_utf8_target_but_utf8_required));
2394 must = prog->anchored_substr;
2396 back_max = back_min = prog->anchored_offset;
2399 if (! prog->float_utf8) {
2400 to_utf8_substr(prog);
2402 must = prog->float_utf8;
2405 if (! prog->float_substr) {
2406 if (! to_byte_substr(prog)) {
2407 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2408 non_utf8_target_but_utf8_required));
2412 must = prog->float_substr;
2414 back_max = prog->float_max_offset;
2415 back_min = prog->float_min_offset;
2421 last = HOP3c(strend, /* Cannot start after this */
2422 -(I32)(CHR_SVLEN(must)
2423 - (SvTAIL(must) != 0) + back_min), strbeg);
2426 last1 = HOPc(s, -1);
2428 last1 = s - 1; /* bogus */
2430 /* XXXX check_substr already used to find "s", can optimize if
2431 check_substr==must. */
2433 dontbother = end_shift;
2434 strend = HOPc(strend, -dontbother);
2435 while ( (s <= last) &&
2436 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2437 (unsigned char*)strend, must,
2438 multiline ? FBMrf_MULTILINE : 0)) ) {
2439 DEBUG_EXECUTE_r( did_match = 1 );
2440 if (HOPc(s, -back_max) > last1) {
2441 last1 = HOPc(s, -back_min);
2442 s = HOPc(s, -back_max);
2445 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2447 last1 = HOPc(s, -back_min);
2451 while (s <= last1) {
2452 if (regtry(®info, &s))
2455 s++; /* to break out of outer loop */
2462 while (s <= last1) {
2463 if (regtry(®info, &s))
2469 DEBUG_EXECUTE_r(if (!did_match) {
2470 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2471 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2472 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2473 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2474 ? "anchored" : "floating"),
2475 quoted, RE_SV_TAIL(must));
2479 else if ( (c = progi->regstclass) ) {
2481 const OPCODE op = OP(progi->regstclass);
2482 /* don't bother with what can't match */
2483 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2484 strend = HOPc(strend, -(minlen - 1));
2487 SV * const prop = sv_newmortal();
2488 regprop(prog, prop, c);
2490 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2492 PerlIO_printf(Perl_debug_log,
2493 "Matching stclass %.*s against %s (%d bytes)\n",
2494 (int)SvCUR(prop), SvPVX_const(prop),
2495 quoted, (int)(strend - s));
2498 if (find_byclass(prog, c, s, strend, ®info))
2500 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2504 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2512 if (! prog->float_utf8) {
2513 to_utf8_substr(prog);
2515 float_real = prog->float_utf8;
2518 if (! prog->float_substr) {
2519 if (! to_byte_substr(prog)) {
2520 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2521 non_utf8_target_but_utf8_required));
2525 float_real = prog->float_substr;
2528 little = SvPV_const(float_real, len);
2529 if (SvTAIL(float_real)) {
2530 /* This means that float_real contains an artificial \n on
2531 * the end due to the presence of something like this:
2532 * /foo$/ where we can match both "foo" and "foo\n" at the
2533 * end of the string. So we have to compare the end of the
2534 * string first against the float_real without the \n and
2535 * then against the full float_real with the string. We
2536 * have to watch out for cases where the string might be
2537 * smaller than the float_real or the float_real without
2539 char *checkpos= strend - len;
2541 PerlIO_printf(Perl_debug_log,
2542 "%sChecking for float_real.%s\n",
2543 PL_colors[4], PL_colors[5]));
2544 if (checkpos + 1 < strbeg) {
2545 /* can't match, even if we remove the trailing \n
2546 * string is too short to match */
2548 PerlIO_printf(Perl_debug_log,
2549 "%sString shorter than required trailing substring, cannot match.%s\n",
2550 PL_colors[4], PL_colors[5]));
2552 } else if (memEQ(checkpos + 1, little, len - 1)) {
2553 /* can match, the end of the string matches without the
2555 last = checkpos + 1;
2556 } else if (checkpos < strbeg) {
2557 /* cant match, string is too short when the "\n" is
2560 PerlIO_printf(Perl_debug_log,
2561 "%sString does not contain required trailing substring, cannot match.%s\n",
2562 PL_colors[4], PL_colors[5]));
2564 } else if (!multiline) {
2565 /* non multiline match, so compare with the "\n" at the
2566 * end of the string */
2567 if (memEQ(checkpos, little, len)) {
2571 PerlIO_printf(Perl_debug_log,
2572 "%sString does not contain required trailing substring, cannot match.%s\n",
2573 PL_colors[4], PL_colors[5]));
2577 /* multiline match, so we have to search for a place
2578 * where the full string is located */
2584 last = rninstr(s, strend, little, little + len);
2586 last = strend; /* matching "$" */
2589 /* at one point this block contained a comment which was
2590 * probably incorrect, which said that this was a "should not
2591 * happen" case. Even if it was true when it was written I am
2592 * pretty sure it is not anymore, so I have removed the comment
2593 * and replaced it with this one. Yves */
2595 PerlIO_printf(Perl_debug_log,
2596 "String does not contain required substring, cannot match.\n"
2600 dontbother = strend - last + prog->float_min_offset;
2602 if (minlen && (dontbother < minlen))
2603 dontbother = minlen - 1;
2604 strend -= dontbother; /* this one's always in bytes! */
2605 /* We don't know much -- general case. */
2608 if (regtry(®info, &s))
2617 if (regtry(®info, &s))
2619 } while (s++ < strend);
2629 PerlIO_printf(Perl_debug_log,
2630 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2636 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2638 if (PL_reg_state.re_state_eval_setup_done)
2639 restore_pos(aTHX_ prog);
2640 if (RXp_PAREN_NAMES(prog))
2641 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2643 /* make sure $`, $&, $', and $digit will work later */
2644 if ( !(flags & REXEC_NOT_FIRST) ) {
2645 if (flags & REXEC_COPY_STR) {
2646 #ifdef PERL_OLD_COPY_ON_WRITE
2648 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2650 PerlIO_printf(Perl_debug_log,
2651 "Copy on write: regexp capture, type %d\n",
2654 RX_MATCH_COPY_FREE(rx);
2655 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2656 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2657 assert (SvPOKp(prog->saved_copy));
2658 prog->sublen = PL_regeol - strbeg;
2659 prog->suboffset = 0;
2660 prog->subcoffset = 0;
2665 I32 max = PL_regeol - strbeg;
2668 if ( (flags & REXEC_COPY_SKIP_POST)
2669 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2670 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2671 ) { /* don't copy $' part of string */
2674 /* calculate the right-most part of the string covered
2675 * by a capture. Due to look-ahead, this may be to
2676 * the right of $&, so we have to scan all captures */
2677 while (n <= prog->lastparen) {
2678 if (prog->offs[n].end > max)
2679 max = prog->offs[n].end;
2683 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2684 ? prog->offs[0].start
2686 assert(max >= 0 && max <= PL_regeol - strbeg);
2689 if ( (flags & REXEC_COPY_SKIP_PRE)
2690 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2691 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2692 ) { /* don't copy $` part of string */
2695 /* calculate the left-most part of the string covered
2696 * by a capture. Due to look-behind, this may be to
2697 * the left of $&, so we have to scan all captures */
2698 while (min && n <= prog->lastparen) {
2699 if ( prog->offs[n].start != -1
2700 && prog->offs[n].start < min)
2702 min = prog->offs[n].start;
2706 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2707 && min > prog->offs[0].end
2709 min = prog->offs[0].end;
2713 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2716 if (RX_MATCH_COPIED(rx)) {
2717 if (sublen > prog->sublen)
2719 (char*)saferealloc(prog->subbeg, sublen+1);
2722 prog->subbeg = (char*)safemalloc(sublen+1);
2723 Copy(strbeg + min, prog->subbeg, sublen, char);
2724 prog->subbeg[sublen] = '\0';
2725 prog->suboffset = min;
2726 prog->sublen = sublen;
2727 RX_MATCH_COPIED_on(rx);
2729 prog->subcoffset = prog->suboffset;
2730 if (prog->suboffset && utf8_target) {
2731 /* Convert byte offset to chars.
2732 * XXX ideally should only compute this if @-/@+
2733 * has been seen, a la PL_sawampersand ??? */
2735 /* If there's a direct correspondence between the
2736 * string which we're matching and the original SV,
2737 * then we can use the utf8 len cache associated with
2738 * the SV. In particular, it means that under //g,
2739 * sv_pos_b2u() will use the previously cached
2740 * position to speed up working out the new length of
2741 * subcoffset, rather than counting from the start of
2742 * the string each time. This stops
2743 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2744 * from going quadratic */
2745 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2746 sv_pos_b2u(sv, &(prog->subcoffset));
2748 prog->subcoffset = utf8_length((U8*)strbeg,
2749 (U8*)(strbeg+prog->suboffset));
2753 RX_MATCH_COPY_FREE(rx);
2754 prog->subbeg = strbeg;
2755 prog->suboffset = 0;
2756 prog->subcoffset = 0;
2757 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2764 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2765 PL_colors[4], PL_colors[5]));
2766 if (PL_reg_state.re_state_eval_setup_done)
2767 restore_pos(aTHX_ prog);
2769 /* we failed :-( roll it back */
2770 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2771 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2776 Safefree(prog->offs);
2783 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2784 * Do inc before dec, in case old and new rex are the same */
2785 #define SET_reg_curpm(Re2) \
2786 if (PL_reg_state.re_state_eval_setup_done) { \
2787 (void)ReREFCNT_inc(Re2); \
2788 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2789 PM_SETRE((PL_reg_curpm), (Re2)); \
2794 - regtry - try match at specific point
2796 STATIC I32 /* 0 failure, 1 success */
2797 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2801 REGEXP *const rx = reginfo->prog;
2802 regexp *const prog = (struct regexp *)SvANY(rx);
2804 RXi_GET_DECL(prog,progi);
2805 GET_RE_DEBUG_FLAGS_DECL;
2807 PERL_ARGS_ASSERT_REGTRY;
2809 reginfo->cutpoint=NULL;
2811 if ((prog->extflags & RXf_EVAL_SEEN)
2812 && !PL_reg_state.re_state_eval_setup_done)
2816 PL_reg_state.re_state_eval_setup_done = TRUE;
2818 /* Make $_ available to executed code. */
2819 if (reginfo->sv != DEFSV) {
2821 DEFSV_set(reginfo->sv);
2824 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2825 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2826 /* prepare for quick setting of pos */
2827 #ifdef PERL_OLD_COPY_ON_WRITE
2828 if (SvIsCOW(reginfo->sv))
2829 sv_force_normal_flags(reginfo->sv, 0);
2831 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2832 &PL_vtbl_mglob, NULL, 0);
2836 PL_reg_oldpos = mg->mg_len;
2837 SAVEDESTRUCTOR_X(restore_pos, prog);
2839 if (!PL_reg_curpm) {
2840 Newxz(PL_reg_curpm, 1, PMOP);
2843 SV* const repointer = &PL_sv_undef;
2844 /* this regexp is also owned by the new PL_reg_curpm, which
2845 will try to free it. */
2846 av_push(PL_regex_padav, repointer);
2847 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2848 PL_regex_pad = AvARRAY(PL_regex_padav);
2853 PL_reg_oldcurpm = PL_curpm;
2854 PL_curpm = PL_reg_curpm;
2855 if (RXp_MATCH_COPIED(prog)) {
2856 /* Here is a serious problem: we cannot rewrite subbeg,
2857 since it may be needed if this match fails. Thus
2858 $` inside (?{}) could fail... */
2859 PL_reg_oldsaved = prog->subbeg;
2860 PL_reg_oldsavedlen = prog->sublen;
2861 PL_reg_oldsavedoffset = prog->suboffset;
2862 PL_reg_oldsavedcoffset = prog->suboffset;
2863 #ifdef PERL_OLD_COPY_ON_WRITE
2864 PL_nrs = prog->saved_copy;
2866 RXp_MATCH_COPIED_off(prog);
2869 PL_reg_oldsaved = NULL;
2870 prog->subbeg = PL_bostr;
2871 prog->suboffset = 0;
2872 prog->subcoffset = 0;
2873 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2876 PL_reg_starttry = *startposp;
2878 prog->offs[0].start = *startposp - PL_bostr;
2879 prog->lastparen = 0;
2880 prog->lastcloseparen = 0;
2883 /* XXXX What this code is doing here?!!! There should be no need
2884 to do this again and again, prog->lastparen should take care of
2887 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2888 * Actually, the code in regcppop() (which Ilya may be meaning by
2889 * prog->lastparen), is not needed at all by the test suite
2890 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2891 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2892 * Meanwhile, this code *is* needed for the
2893 * above-mentioned test suite tests to succeed. The common theme
2894 * on those tests seems to be returning null fields from matches.
2895 * --jhi updated by dapm */
2897 if (prog->nparens) {
2898 regexp_paren_pair *pp = prog->offs;
2900 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2908 result = regmatch(reginfo, *startposp, progi->program + 1);
2910 prog->offs[0].end = result;
2913 if (reginfo->cutpoint)
2914 *startposp= reginfo->cutpoint;
2915 REGCP_UNWIND(lastcp);
2920 #define sayYES goto yes
2921 #define sayNO goto no
2922 #define sayNO_SILENT goto no_silent
2924 /* we dont use STMT_START/END here because it leads to
2925 "unreachable code" warnings, which are bogus, but distracting. */
2926 #define CACHEsayNO \
2927 if (ST.cache_mask) \
2928 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2931 /* this is used to determine how far from the left messages like
2932 'failed...' are printed. It should be set such that messages
2933 are inline with the regop output that created them.
2935 #define REPORT_CODE_OFF 32
2938 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2939 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2940 #define CHRTEST_NOT_A_CP_1 -999
2941 #define CHRTEST_NOT_A_CP_2 -998
2943 #define SLAB_FIRST(s) (&(s)->states[0])
2944 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2946 /* grab a new slab and return the first slot in it */
2948 STATIC regmatch_state *
2951 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2954 regmatch_slab *s = PL_regmatch_slab->next;
2956 Newx(s, 1, regmatch_slab);
2957 s->prev = PL_regmatch_slab;
2959 PL_regmatch_slab->next = s;
2961 PL_regmatch_slab = s;
2962 return SLAB_FIRST(s);
2966 /* push a new state then goto it */
2968 #define PUSH_STATE_GOTO(state, node, input) \
2969 pushinput = input; \
2971 st->resume_state = state; \
2974 /* push a new state with success backtracking, then goto it */
2976 #define PUSH_YES_STATE_GOTO(state, node, input) \
2977 pushinput = input; \
2979 st->resume_state = state; \
2980 goto push_yes_state;
2987 regmatch() - main matching routine
2989 This is basically one big switch statement in a loop. We execute an op,
2990 set 'next' to point the next op, and continue. If we come to a point which
2991 we may need to backtrack to on failure such as (A|B|C), we push a
2992 backtrack state onto the backtrack stack. On failure, we pop the top
2993 state, and re-enter the loop at the state indicated. If there are no more
2994 states to pop, we return failure.
2996 Sometimes we also need to backtrack on success; for example /A+/, where
2997 after successfully matching one A, we need to go back and try to
2998 match another one; similarly for lookahead assertions: if the assertion
2999 completes successfully, we backtrack to the state just before the assertion
3000 and then carry on. In these cases, the pushed state is marked as
3001 'backtrack on success too'. This marking is in fact done by a chain of
3002 pointers, each pointing to the previous 'yes' state. On success, we pop to
3003 the nearest yes state, discarding any intermediate failure-only states.
3004 Sometimes a yes state is pushed just to force some cleanup code to be
3005 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3006 it to free the inner regex.
3008 Note that failure backtracking rewinds the cursor position, while
3009 success backtracking leaves it alone.
3011 A pattern is complete when the END op is executed, while a subpattern
3012 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3013 ops trigger the "pop to last yes state if any, otherwise return true"
3016 A common convention in this function is to use A and B to refer to the two
3017 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3018 the subpattern to be matched possibly multiple times, while B is the entire
3019 rest of the pattern. Variable and state names reflect this convention.
3021 The states in the main switch are the union of ops and failure/success of
3022 substates associated with with that op. For example, IFMATCH is the op
3023 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3024 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3025 successfully matched A and IFMATCH_A_fail is a state saying that we have
3026 just failed to match A. Resume states always come in pairs. The backtrack
3027 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3028 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3029 on success or failure.
3031 The struct that holds a backtracking state is actually a big union, with
3032 one variant for each major type of op. The variable st points to the
3033 top-most backtrack struct. To make the code clearer, within each
3034 block of code we #define ST to alias the relevant union.
3036 Here's a concrete example of a (vastly oversimplified) IFMATCH
3042 #define ST st->u.ifmatch
3044 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3045 ST.foo = ...; // some state we wish to save
3047 // push a yes backtrack state with a resume value of
3048 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3050 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3053 case IFMATCH_A: // we have successfully executed A; now continue with B
3055 bar = ST.foo; // do something with the preserved value
3058 case IFMATCH_A_fail: // A failed, so the assertion failed
3059 ...; // do some housekeeping, then ...
3060 sayNO; // propagate the failure
3067 For any old-timers reading this who are familiar with the old recursive
3068 approach, the code above is equivalent to:
3070 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3079 ...; // do some housekeeping, then ...
3080 sayNO; // propagate the failure
3083 The topmost backtrack state, pointed to by st, is usually free. If you
3084 want to claim it, populate any ST.foo fields in it with values you wish to
3085 save, then do one of
3087 PUSH_STATE_GOTO(resume_state, node, newinput);
3088 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3090 which sets that backtrack state's resume value to 'resume_state', pushes a
3091 new free entry to the top of the backtrack stack, then goes to 'node'.
3092 On backtracking, the free slot is popped, and the saved state becomes the
3093 new free state. An ST.foo field in this new top state can be temporarily
3094 accessed to retrieve values, but once the main loop is re-entered, it
3095 becomes available for reuse.
3097 Note that the depth of the backtrack stack constantly increases during the
3098 left-to-right execution of the pattern, rather than going up and down with
3099 the pattern nesting. For example the stack is at its maximum at Z at the
3100 end of the pattern, rather than at X in the following:
3102 /(((X)+)+)+....(Y)+....Z/
3104 The only exceptions to this are lookahead/behind assertions and the cut,
3105 (?>A), which pop all the backtrack states associated with A before
3108 Backtrack state structs are allocated in slabs of about 4K in size.
3109 PL_regmatch_state and st always point to the currently active state,
3110 and PL_regmatch_slab points to the slab currently containing
3111 PL_regmatch_state. The first time regmatch() is called, the first slab is
3112 allocated, and is never freed until interpreter destruction. When the slab
3113 is full, a new one is allocated and chained to the end. At exit from
3114 regmatch(), slabs allocated since entry are freed.
3119 #define DEBUG_STATE_pp(pp) \
3121 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3122 PerlIO_printf(Perl_debug_log, \
3123 " %*s"pp" %s%s%s%s%s\n", \
3125 PL_reg_name[st->resume_state], \
3126 ((st==yes_state||st==mark_state) ? "[" : ""), \
3127 ((st==yes_state) ? "Y" : ""), \
3128 ((st==mark_state) ? "M" : ""), \
3129 ((st==yes_state||st==mark_state) ? "]" : "") \
3134 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3139 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3140 const char *start, const char *end, const char *blurb)
3142 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3144 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3149 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3150 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3152 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3153 start, end - start, 60);
3155 PerlIO_printf(Perl_debug_log,
3156 "%s%s REx%s %s against %s\n",
3157 PL_colors[4], blurb, PL_colors[5], s0, s1);
3159 if (utf8_target||utf8_pat)
3160 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3161 utf8_pat ? "pattern" : "",
3162 utf8_pat && utf8_target ? " and " : "",
3163 utf8_target ? "string" : ""
3169 S_dump_exec_pos(pTHX_ const char *locinput,
3170 const regnode *scan,
3171 const char *loc_regeol,
3172 const char *loc_bostr,
3173 const char *loc_reg_starttry,
3174 const bool utf8_target)
3176 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3177 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3178 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3179 /* The part of the string before starttry has one color
3180 (pref0_len chars), between starttry and current
3181 position another one (pref_len - pref0_len chars),
3182 after the current position the third one.
3183 We assume that pref0_len <= pref_len, otherwise we
3184 decrease pref0_len. */
3185 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3186 ? (5 + taill) - l : locinput - loc_bostr;
3189 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3191 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3193 pref0_len = pref_len - (locinput - loc_reg_starttry);
3194 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3195 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3196 ? (5 + taill) - pref_len : loc_regeol - locinput);
3197 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3201 if (pref0_len > pref_len)
3202 pref0_len = pref_len;
3204 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3206 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3207 (locinput - pref_len),pref0_len, 60, 4, 5);
3209 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3210 (locinput - pref_len + pref0_len),
3211 pref_len - pref0_len, 60, 2, 3);
3213 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3214 locinput, loc_regeol - locinput, 10, 0, 1);
3216 const STRLEN tlen=len0+len1+len2;
3217 PerlIO_printf(Perl_debug_log,
3218 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3219 (IV)(locinput - loc_bostr),
3222 (docolor ? "" : "> <"),
3224 (int)(tlen > 19 ? 0 : 19 - tlen),
3231 /* reg_check_named_buff_matched()
3232 * Checks to see if a named buffer has matched. The data array of
3233 * buffer numbers corresponding to the buffer is expected to reside
3234 * in the regexp->data->data array in the slot stored in the ARG() of
3235 * node involved. Note that this routine doesn't actually care about the
3236 * name, that information is not preserved from compilation to execution.
3237 * Returns the index of the leftmost defined buffer with the given name
3238 * or 0 if non of the buffers matched.
3241 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3244 RXi_GET_DECL(rex,rexi);
3245 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3246 I32 *nums=(I32*)SvPVX(sv_dat);
3248 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3250 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3251 if ((I32)rex->lastparen >= nums[n] &&
3252 rex->offs[nums[n]].end != -1)
3261 /* free all slabs above current one - called during LEAVE_SCOPE */
3264 S_clear_backtrack_stack(pTHX_ void *p)
3266 regmatch_slab *s = PL_regmatch_slab->next;
3271 PL_regmatch_slab->next = NULL;
3273 regmatch_slab * const osl = s;
3279 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8)
3281 /* This function determines if there are one or two characters that match
3282 * the first character of the passed-in EXACTish node <text_node>, and if
3283 * so, returns them in the passed-in pointers.
3285 * If it determines that no possible character in the target string can
3286 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3287 * the first character in <text_node> requires UTF-8 to represent, and the
3288 * target string isn't in UTF-8.)
3290 * If there are more than two characters that could match the beginning of
3291 * <text_node>, or if more context is required to determine a match or not,
3292 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3294 * The motiviation behind this function is to allow the caller to set up
3295 * tight loops for matching. If <text_node> is of type EXACT, there is
3296 * only one possible character that can match its first character, and so
3297 * the situation is quite simple. But things get much more complicated if
3298 * folding is involved. It may be that the first character of an EXACTFish
3299 * node doesn't participate in any possible fold, e.g., punctuation, so it
3300 * can be matched only by itself. The vast majority of characters that are
3301 * in folds match just two things, their lower and upper-case equivalents.
3302 * But not all are like that; some have multiple possible matches, or match
3303 * sequences of more than one character. This function sorts all that out.
3305 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3306 * loop of trying to match A*, we know we can't exit where the thing
3307 * following it isn't a B. And something can't be a B unless it is the
3308 * beginning of B. By putting a quick test for that beginning in a tight
3309 * loop, we can rule out things that can't possibly be B without having to
3310 * break out of the loop, thus avoiding work. Similarly, if A is a single
3311 * character, we can make a tight loop matching A*, using the outputs of
3314 * If the target string to match isn't in UTF-8, and there aren't
3315 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3316 * the one or two possible octets (which are characters in this situation)
3317 * that can match. In all cases, if there is only one character that can
3318 * match, *<c1p> and *<c2p> will be identical.
3320 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3321 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3322 * can match the beginning of <text_node>. They should be declared with at
3323 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3324 * undefined what these contain.) If one or both of the buffers are
3325 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3326 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3327 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3328 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3329 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3331 const bool utf8_target = PL_reg_match_utf8;
3334 bool use_chrtest_void = FALSE;
3336 /* Used when we have both utf8 input and utf8 output, to avoid converting
3337 * to/from code points */
3338 bool utf8_has_been_setup = FALSE;
3342 U8 *pat = (U8*)STRING(text_node);
3344 if (OP(text_node) == EXACT) {
3346 /* In an exact node, only one thing can be matched, that first
3347 * character. If both the pat and the target are UTF-8, we can just
3348 * copy the input to the output, avoiding finding the code point of
3350 if (! UTF_PATTERN) {
3353 else if (utf8_target) {
3354 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3355 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3356 utf8_has_been_setup = TRUE;
3359 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3362 else /* an EXACTFish node */
3364 && is_MULTI_CHAR_FOLD_utf8_safe(pat,
3365 pat + STR_LEN(text_node)))
3367 && is_MULTI_CHAR_FOLD_latin1_safe(pat,
3368 pat + STR_LEN(text_node))))
3370 /* Multi-character folds require more context to sort out. Also
3371 * PL_utf8_foldclosures used below doesn't handle them, so have to be
3372 * handled outside this routine */
3373 use_chrtest_void = TRUE;
3375 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3376 c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3378 /* Load the folds hash, if not already done */
3380 if (! PL_utf8_foldclosures) {
3381 if (! PL_utf8_tofold) {
3382 U8 dummy[UTF8_MAXBYTES+1];
3384 /* Force loading this by folding an above-Latin1 char */
3385 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3386 assert(PL_utf8_tofold); /* Verify that worked */
3388 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3391 /* The fold closures data structure is a hash with the keys being
3392 * the UTF-8 of every character that is folded to, like 'k', and
3393 * the values each an array of all code points that fold to its
3394 * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are
3396 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3401 /* Not found in the hash, therefore there are no folds
3402 * containing it, so there is only a single character that
3406 else { /* Does participate in folds */
3407 AV* list = (AV*) *listp;
3408 if (av_len(list) != 1) {
3410 /* If there aren't exactly two folds to this, it is outside
3411 * the scope of this function */
3412 use_chrtest_void = TRUE;
3414 else { /* There are two. Get them */
3415 SV** c_p = av_fetch(list, 0, FALSE);
3417 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3421 c_p = av_fetch(list, 1, FALSE);
3423 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3427 /* Folds that cross the 255/256 boundary are forbidden if
3428 * EXACTFL, or EXACTFA and one is ASCIII. Since the
3429 * pattern character is above 256, and its only other match
3430 * is below 256, the only legal match will be to itself.
3431 * We have thrown away the original, so have to compute
3432 * which is the one above 255 */
3433 if ((c1 < 256) != (c2 < 256)) {
3434 if (OP(text_node) == EXACTFL
3435 || (OP(text_node) == EXACTFA
3436 && (isASCII(c1) || isASCII(c2))))
3449 else /* Here, c1 is < 255 */
3451 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3452 && OP(text_node) != EXACTFL
3453 && (OP(text_node) != EXACTFA || ! isASCII(c1)))
3455 /* Here, there could be something above Latin1 in the target which
3456 * folds to this character in the pattern. All such cases except
3457 * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters
3458 * involved in their folds, so are outside the scope of this
3460 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3461 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3464 use_chrtest_void = TRUE;
3467 else { /* Here nothing above Latin1 can fold to the pattern character */
3468 switch (OP(text_node)) {
3470 case EXACTFL: /* /l rules */
3471 c2 = PL_fold_locale[c1];
3475 if (! utf8_target) { /* /d rules */
3480 /* /u rules for all these. This happens to work for
3481 * EXACTFA as nothing in Latin1 folds to ASCII */
3483 case EXACTFU_TRICKYFOLD:
3486 c2 = PL_fold_latin1[c1];
3489 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3494 /* Here have figured things out. Set up the returns */
3495 if (use_chrtest_void) {
3496 *c2p = *c1p = CHRTEST_VOID;
3498 else if (utf8_target) {
3499 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3500 uvchr_to_utf8(c1_utf8, c1);
3501 uvchr_to_utf8(c2_utf8, c2);
3504 /* Invariants are stored in both the utf8 and byte outputs; Use
3505 * negative numbers otherwise for the byte ones. Make sure that the
3506 * byte ones are the same iff the utf8 ones are the same */
3507 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3508 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3511 ? CHRTEST_NOT_A_CP_1
3512 : CHRTEST_NOT_A_CP_2;
3514 else if (c1 > 255) {
3515 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3520 *c1p = *c2p = c2; /* c2 is the only representable value */
3522 else { /* c1 is representable; see about c2 */
3524 *c2p = (c2 < 256) ? c2 : c1;
3530 /* returns -1 on failure, $+[0] on success */
3532 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3534 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3538 const bool utf8_target = PL_reg_match_utf8;
3539 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3540 REGEXP *rex_sv = reginfo->prog;
3541 regexp *rex = (struct regexp *)SvANY(rex_sv);
3542 RXi_GET_DECL(rex,rexi);
3544 /* the current state. This is a cached copy of PL_regmatch_state */
3546 /* cache heavy used fields of st in registers */
3549 U32 n = 0; /* general value; init to avoid compiler warning */
3550 I32 ln = 0; /* len or last; init to avoid compiler warning */
3551 char *locinput = startpos;
3552 char *pushinput; /* where to continue after a PUSH */
3553 I32 nextchr; /* is always set to UCHARAT(locinput) */
3555 bool result = 0; /* return value of S_regmatch */
3556 int depth = 0; /* depth of backtrack stack */
3557 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3558 const U32 max_nochange_depth =
3559 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3560 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3561 regmatch_state *yes_state = NULL; /* state to pop to on success of
3563 /* mark_state piggy backs on the yes_state logic so that when we unwind
3564 the stack on success we can update the mark_state as we go */
3565 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3566 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3567 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3569 bool no_final = 0; /* prevent failure from backtracking? */
3570 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3571 char *startpoint = locinput;
3572 SV *popmark = NULL; /* are we looking for a mark? */
3573 SV *sv_commit = NULL; /* last mark name seen in failure */
3574 SV *sv_yes_mark = NULL; /* last mark name we have seen
3575 during a successful match */
3576 U32 lastopen = 0; /* last open we saw */
3577 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3578 SV* const oreplsv = GvSV(PL_replgv);
3579 /* these three flags are set by various ops to signal information to
3580 * the very next op. They have a useful lifetime of exactly one loop
3581 * iteration, and are not preserved or restored by state pushes/pops
3583 bool sw = 0; /* the condition value in (?(cond)a|b) */
3584 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3585 int logical = 0; /* the following EVAL is:
3589 or the following IFMATCH/UNLESSM is:
3590 false: plain (?=foo)
3591 true: used as a condition: (?(?=foo))
3593 PAD* last_pad = NULL;
3595 I32 gimme = G_SCALAR;
3596 CV *caller_cv = NULL; /* who called us */
3597 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3598 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3601 GET_RE_DEBUG_FLAGS_DECL;
3604 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3605 multicall_oldcatch = 0;
3606 multicall_cv = NULL;
3608 PERL_UNUSED_VAR(multicall_cop);
3609 PERL_UNUSED_VAR(newsp);
3612 PERL_ARGS_ASSERT_REGMATCH;
3614 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3615 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3617 /* on first ever call to regmatch, allocate first slab */
3618 if (!PL_regmatch_slab) {
3619 Newx(PL_regmatch_slab, 1, regmatch_slab);
3620 PL_regmatch_slab->prev = NULL;
3621 PL_regmatch_slab->next = NULL;
3622 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3625 oldsave = PL_savestack_ix;
3626 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3627 SAVEVPTR(PL_regmatch_slab);
3628 SAVEVPTR(PL_regmatch_state);
3630 /* grab next free state slot */
3631 st = ++PL_regmatch_state;
3632 if (st > SLAB_LAST(PL_regmatch_slab))
3633 st = PL_regmatch_state = S_push_slab(aTHX);
3635 /* Note that nextchr is a byte even in UTF */
3638 while (scan != NULL) {
3641 SV * const prop = sv_newmortal();
3642 regnode *rnext=regnext(scan);
3643 DUMP_EXEC_POS( locinput, scan, utf8_target );
3644 regprop(rex, prop, scan);
3646 PerlIO_printf(Perl_debug_log,
3647 "%3"IVdf":%*s%s(%"IVdf")\n",
3648 (IV)(scan - rexi->program), depth*2, "",
3650 (PL_regkind[OP(scan)] == END || !rnext) ?
3651 0 : (IV)(rnext - rexi->program));
3654 next = scan + NEXT_OFF(scan);
3657 state_num = OP(scan);
3659 REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3663 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3665 switch (state_num) {
3666 case BOL: /* /^../ */
3667 if (locinput == PL_bostr)
3669 /* reginfo->till = reginfo->bol; */
3674 case MBOL: /* /^../m */
3675 if (locinput == PL_bostr ||
3676 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
3682 case SBOL: /* /^../s */
3683 if (locinput == PL_bostr)
3688 if (locinput == reginfo->ganch)
3692 case KEEPS: /* \K */
3693 /* update the startpoint */
3694 st->u.keeper.val = rex->offs[0].start;
3695 rex->offs[0].start = locinput - PL_bostr;
3696 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3698 case KEEPS_next_fail:
3699 /* rollback the start point change */
3700 rex->offs[0].start = st->u.keeper.val;
3704 case EOL: /* /..$/ */
3707 case MEOL: /* /..$/m */
3708 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3712 case SEOL: /* /..$/s */
3714 if (!NEXTCHR_IS_EOS && nextchr != '\n')
3716 if (PL_regeol - locinput > 1)
3721 if (!NEXTCHR_IS_EOS)
3725 case SANY: /* /./s */
3728 goto increment_locinput;
3736 case REG_ANY: /* /./ */
3737 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
3739 goto increment_locinput;
3743 #define ST st->u.trie
3744 case TRIEC: /* (ab|cd) with known charclass */
3745 /* In this case the charclass data is available inline so
3746 we can fail fast without a lot of extra overhead.
3748 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
3750 PerlIO_printf(Perl_debug_log,
3751 "%*s %sfailed to match trie start class...%s\n",
3752 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3755 assert(0); /* NOTREACHED */
3758 case TRIE: /* (ab|cd) */
3759 /* the basic plan of execution of the trie is:
3760 * At the beginning, run though all the states, and
3761 * find the longest-matching word. Also remember the position
3762 * of the shortest matching word. For example, this pattern:
3765 * when matched against the string "abcde", will generate
3766 * accept states for all words except 3, with the longest
3767 * matching word being 4, and the shortest being 2 (with
3768 * the position being after char 1 of the string).
3770 * Then for each matching word, in word order (i.e. 1,2,4,5),
3771 * we run the remainder of the pattern; on each try setting
3772 * the current position to the character following the word,
3773 * returning to try the next word on failure.
3775 * We avoid having to build a list of words at runtime by
3776 * using a compile-time structure, wordinfo[].prev, which
3777 * gives, for each word, the previous accepting word (if any).
3778 * In the case above it would contain the mappings 1->2, 2->0,
3779 * 3->0, 4->5, 5->1. We can use this table to generate, from
3780 * the longest word (4 above), a list of all words, by
3781 * following the list of prev pointers; this gives us the
3782 * unordered list 4,5,1,2. Then given the current word we have
3783 * just tried, we can go through the list and find the
3784 * next-biggest word to try (so if we just failed on word 2,
3785 * the next in the list is 4).
3787 * Since at runtime we don't record the matching position in
3788 * the string for each word, we have to work that out for
3789 * each word we're about to process. The wordinfo table holds
3790 * the character length of each word; given that we recorded
3791 * at the start: the position of the shortest word and its
3792 * length in chars, we just need to move the pointer the
3793 * difference between the two char lengths. Depending on
3794 * Unicode status and folding, that's cheap or expensive.
3796 * This algorithm is optimised for the case where are only a
3797 * small number of accept states, i.e. 0,1, or maybe 2.
3798 * With lots of accepts states, and having to try all of them,
3799 * it becomes quadratic on number of accept states to find all
3804 /* what type of TRIE am I? (utf8 makes this contextual) */
3805 DECL_TRIE_TYPE(scan);
3807 /* what trie are we using right now */
3808 reg_trie_data * const trie
3809 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3810 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3811 U32 state = trie->startstate;
3814 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
3816 if (trie->states[ state ].wordnum) {
3818 PerlIO_printf(Perl_debug_log,
3819 "%*s %smatched empty string...%s\n",
3820 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3826 PerlIO_printf(Perl_debug_log,
3827 "%*s %sfailed to match trie start class...%s\n",
3828 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3835 U8 *uc = ( U8* )locinput;
3839 U8 *uscan = (U8*)NULL;
3840 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3841 U32 charcount = 0; /* how many input chars we have matched */
3842 U32 accepted = 0; /* have we seen any accepting states? */
3844 ST.jump = trie->jump;
3847 ST.longfold = FALSE; /* char longer if folded => it's harder */
3850 /* fully traverse the TRIE; note the position of the
3851 shortest accept state and the wordnum of the longest
3854 while ( state && uc <= (U8*)PL_regeol ) {
3855 U32 base = trie->states[ state ].trans.base;
3859 wordnum = trie->states[ state ].wordnum;
3861 if (wordnum) { /* it's an accept state */
3864 /* record first match position */
3866 ST.firstpos = (U8*)locinput;
3871 ST.firstchars = charcount;
3874 if (!ST.nextword || wordnum < ST.nextword)
3875 ST.nextword = wordnum;
3876 ST.topword = wordnum;
3879 DEBUG_TRIE_EXECUTE_r({
3880 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3881 PerlIO_printf( Perl_debug_log,
3882 "%*s %sState: %4"UVxf" Accepted: %c ",
3883 2+depth * 2, "", PL_colors[4],
3884 (UV)state, (accepted ? 'Y' : 'N'));
3887 /* read a char and goto next state */
3888 if ( base && (foldlen || uc < (U8*)PL_regeol)) {
3890 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3891 uscan, len, uvc, charid, foldlen,
3898 base + charid - 1 - trie->uniquecharcount)) >= 0)
3900 && ((U32)offset < trie->lasttrans)
3901 && trie->trans[offset].check == state)
3903 state = trie->trans[offset].next;
3914 DEBUG_TRIE_EXECUTE_r(
3915 PerlIO_printf( Perl_debug_log,
3916 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3917 charid, uvc, (UV)state, PL_colors[5] );
3923 /* calculate total number of accept states */
3928 w = trie->wordinfo[w].prev;
3931 ST.accepted = accepted;
3935 PerlIO_printf( Perl_debug_log,
3936 "%*s %sgot %"IVdf" possible matches%s\n",
3937 REPORT_CODE_OFF + depth * 2, "",
3938 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3940 goto trie_first_try; /* jump into the fail handler */
3942 assert(0); /* NOTREACHED */
3944 case TRIE_next_fail: /* we failed - try next alternative */
3948 REGCP_UNWIND(ST.cp);
3949 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3951 if (!--ST.accepted) {
3953 PerlIO_printf( Perl_debug_log,
3954 "%*s %sTRIE failed...%s\n",
3955 REPORT_CODE_OFF+depth*2, "",
3962 /* Find next-highest word to process. Note that this code
3963 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3966 U16 const nextword = ST.nextword;
3967 reg_trie_wordinfo * const wordinfo
3968 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3969 for (word=ST.topword; word; word=wordinfo[word].prev) {
3970 if (word > nextword && (!min || word < min))
3983 ST.lastparen = rex->lastparen;
3984 ST.lastcloseparen = rex->lastcloseparen;
3988 /* find start char of end of current word */
3990 U32 chars; /* how many chars to skip */
3991 reg_trie_data * const trie
3992 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3994 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3996 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4001 /* the hard option - fold each char in turn and find
4002 * its folded length (which may be different */
4003 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4011 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
4019 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4024 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
4040 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4041 ? ST.jump[ST.nextword]
4045 PerlIO_printf( Perl_debug_log,
4046 "%*s %sTRIE matched word #%d, continuing%s\n",
4047 REPORT_CODE_OFF+depth*2, "",
4054 if (ST.accepted > 1 || has_cutgroup) {
4055 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4056 assert(0); /* NOTREACHED */
4058 /* only one choice left - just continue */
4060 AV *const trie_words
4061 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4062 SV ** const tmp = av_fetch( trie_words,
4064 SV *sv= tmp ? sv_newmortal() : NULL;
4066 PerlIO_printf( Perl_debug_log,
4067 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4068 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4070 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4071 PL_colors[0], PL_colors[1],
4072 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4074 : "not compiled under -Dr",
4078 locinput = (char*)uc;
4079 continue; /* execute rest of RE */
4080 assert(0); /* NOTREACHED */
4084 case EXACT: { /* /abc/ */
4085 char *s = STRING(scan);
4087 if (utf8_target != UTF_PATTERN) {
4088 /* The target and the pattern have differing utf8ness. */
4090 const char * const e = s + ln;
4093 /* The target is utf8, the pattern is not utf8.
4094 * Above-Latin1 code points can't match the pattern;
4095 * invariants match exactly, and the other Latin1 ones need
4096 * to be downgraded to a single byte in order to do the
4097 * comparison. (If we could be confident that the target
4098 * is not malformed, this could be refactored to have fewer
4099 * tests by just assuming that if the first bytes match, it
4100 * is an invariant, but there are tests in the test suite
4101 * dealing with (??{...}) which violate this) */
4105 if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
4108 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4115 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
4124 /* The target is not utf8, the pattern is utf8. */
4126 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4130 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4137 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
4148 /* The target and the pattern have the same utf8ness. */
4149 /* Inline the first character, for speed. */
4150 if (UCHARAT(s) != nextchr)
4152 if (PL_regeol - locinput < ln)
4154 if (ln > 1 && memNE(s, locinput, ln))
4160 case EXACTFL: { /* /abc/il */
4162 const U8 * fold_array;
4164 U32 fold_utf8_flags;
4166 PL_reg_flags |= RF_tainted;
4167 folder = foldEQ_locale;
4168 fold_array = PL_fold_locale;
4169 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
4172 case EXACTFU_SS: /* /\x{df}/iu */
4173 case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */
4174 case EXACTFU: /* /abc/iu */
4175 folder = foldEQ_latin1;
4176 fold_array = PL_fold_latin1;
4177 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4180 case EXACTFA: /* /abc/iaa */
4181 folder = foldEQ_latin1;
4182 fold_array = PL_fold_latin1;
4183 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4186 case EXACTF: /* /abc/i */
4188 fold_array = PL_fold;
4189 fold_utf8_flags = 0;
4195 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
4196 /* Either target or the pattern are utf8, or has the issue where
4197 * the fold lengths may differ. */
4198 const char * const l = locinput;
4199 char *e = PL_regeol;
4201 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
4202 l, &e, 0, utf8_target, fold_utf8_flags))
4210 /* Neither the target nor the pattern are utf8 */
4211 if (UCHARAT(s) != nextchr
4213 && UCHARAT(s) != fold_array[nextchr])
4217 if (PL_regeol - locinput < ln)
4219 if (ln > 1 && ! folder(s, locinput, ln))
4225 /* XXX Could improve efficiency by separating these all out using a
4226 * macro or in-line function. At that point regcomp.c would no longer
4227 * have to set the FLAGS fields of these */
4228 case BOUNDL: /* /\b/l */
4229 case NBOUNDL: /* /\B/l */
4230 PL_reg_flags |= RF_tainted;
4232 case BOUND: /* /\b/ */
4233 case BOUNDU: /* /\b/u */
4234 case BOUNDA: /* /\b/a */
4235 case NBOUND: /* /\B/ */
4236 case NBOUNDU: /* /\B/u */
4237 case NBOUNDA: /* /\B/a */
4238 /* was last char in word? */
4240 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4241 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4243 if (locinput == PL_bostr)
4246 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
4248 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
4250 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4251 ln = isALNUM_uni(ln);
4255 LOAD_UTF8_CHARCLASS_ALNUM();
4256 n = swash_fetch(PL_utf8_alnum, (U8*)locinput,
4261 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
4262 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
4267 /* Here the string isn't utf8, or is utf8 and only ascii
4268 * characters are to match \w. In the latter case looking at
4269 * the byte just prior to the current one may be just the final
4270 * byte of a multi-byte character. This is ok. There are two
4272 * 1) it is a single byte character, and then the test is doing
4273 * just what it's supposed to.
4274 * 2) it is a multi-byte character, in which case the final
4275 * byte is never mistakable for ASCII, and so the test
4276 * will say it is not a word character, which is the
4277 * correct answer. */
4278 ln = (locinput != PL_bostr) ?
4279 UCHARAT(locinput - 1) : '\n';
4280 switch (FLAGS(scan)) {
4281 case REGEX_UNICODE_CHARSET:
4282 ln = isWORDCHAR_L1(ln);
4283 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4285 case REGEX_LOCALE_CHARSET:
4286 ln = isALNUM_LC(ln);
4287 n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
4289 case REGEX_DEPENDS_CHARSET:
4291 n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
4293 case REGEX_ASCII_RESTRICTED_CHARSET:
4294 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4295 ln = isWORDCHAR_A(ln);
4296 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4299 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4303 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4305 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4309 case ANYOF: /* /[abc]/ */
4313 STRLEN inclasslen = PL_regeol - locinput;
4314 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
4316 locinput += inclasslen;
4320 if (!REGINCLASS(rex, scan, (U8*)locinput))
4327 /* Special char classes: \d, \w etc.
4328 * The defines start on line 166 or so */
4329 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
4330 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4331 ALNUMU, NALNUMU, isWORDCHAR_L1,
4332 ALNUMA, NALNUMA, isWORDCHAR_A,
4335 CCC_TRY_U(SPACE, NSPACE, isSPACE,
4336 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4337 SPACEU, NSPACEU, isSPACE_L1,
4338 SPACEA, NSPACEA, isSPACE_A,
4341 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
4342 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4343 DIGITA, NDIGITA, isDIGIT_A,
4346 case POSIXA: /* /[[:ascii:]]/ etc */
4347 if (NEXTCHR_IS_EOS || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4350 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4354 case NPOSIXA: /* /[^[:ascii:]]/ etc */
4355 if (NEXTCHR_IS_EOS || _generic_isCC_A(nextchr, FLAGS(scan))) {
4358 goto increment_locinput;
4360 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4361 a Unicode extended Grapheme Cluster */
4362 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4363 extended Grapheme Cluster is:
4366 | Prepend* Begin Extend*
4369 Begin is: ( Special_Begin | ! Control )
4370 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4371 Extend is: ( Grapheme_Extend | Spacing_Mark )
4372 Control is: [ GCB_Control CR LF ]
4373 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4375 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4378 Begin is ( Regular_Begin + Special Begin )
4380 It turns out that 98.4% of all Unicode code points match
4381 Regular_Begin. Doing it this way eliminates a table match in
4382 the previous implementation for almost all Unicode code points.
4384 There is a subtlety with Prepend* which showed up in testing.
4385 Note that the Begin, and only the Begin is required in:
4386 | Prepend* Begin Extend*
4387 Also, Begin contains '! Control'. A Prepend must be a
4388 '! Control', which means it must also be a Begin. What it
4389 comes down to is that if we match Prepend* and then find no
4390 suitable Begin afterwards, that if we backtrack the last
4391 Prepend, that one will be a suitable Begin.
4396 if (! utf8_target) {
4398 /* Match either CR LF or '.', as all the other possibilities
4400 locinput++; /* Match the . or CR */
4401 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4403 && locinput < PL_regeol
4404 && UCHARAT(locinput) == '\n') locinput++;
4408 /* Utf8: See if is ( CR LF ); already know that locinput <
4409 * PL_regeol, so locinput+1 is in bounds */
4410 if ( nextchr == '\r' && locinput+1 < PL_regeol
4411 && UCHARAT(locinput + 1) == '\n')
4418 /* In case have to backtrack to beginning, then match '.' */
4419 char *starting = locinput;
4421 /* In case have to backtrack the last prepend */
4422 char *previous_prepend = 0;
4424 LOAD_UTF8_CHARCLASS_GCB();
4426 /* Match (prepend)* */
4427 while (locinput < PL_regeol
4428 && (len = is_GCB_Prepend_utf8(locinput)))
4430 previous_prepend = locinput;
4434 /* As noted above, if we matched a prepend character, but
4435 * the next thing won't match, back off the last prepend we
4436 * matched, as it is guaranteed to match the begin */
4437 if (previous_prepend
4438 && (locinput >= PL_regeol
4439 || (! swash_fetch(PL_utf8_X_regular_begin,
4440 (U8*)locinput, utf8_target)
4441 && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4444 locinput = previous_prepend;
4447 /* Note that here we know PL_regeol > locinput, as we
4448 * tested that upon input to this switch case, and if we
4449 * moved locinput forward, we tested the result just above
4450 * and it either passed, or we backed off so that it will
4452 if (swash_fetch(PL_utf8_X_regular_begin,
4453 (U8*)locinput, utf8_target)) {
4454 locinput += UTF8SKIP(locinput);
4456 else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4458 /* Here did not match the required 'Begin' in the
4459 * second term. So just match the very first
4460 * character, the '.' of the final term of the regex */
4461 locinput = starting + UTF8SKIP(starting);
4465 /* Here is a special begin. It can be composed of
4466 * several individual characters. One possibility is
4468 if ((len = is_GCB_RI_utf8(locinput))) {
4470 while (locinput < PL_regeol
4471 && (len = is_GCB_RI_utf8(locinput)))
4475 } else if ((len = is_GCB_T_utf8(locinput))) {
4476 /* Another possibility is T+ */
4478 while (locinput < PL_regeol
4479 && (len = is_GCB_T_utf8(locinput)))
4485 /* Here, neither RI+ nor T+; must be some other
4486 * Hangul. That means it is one of the others: L,
4487 * LV, LVT or V, and matches:
4488 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4491 while (locinput < PL_regeol
4492 && (len = is_GCB_L_utf8(locinput)))
4497 /* Here, have exhausted L*. If the next character
4498 * is not an LV, LVT nor V, it means we had to have
4499 * at least one L, so matches L+ in the original
4500 * equation, we have a complete hangul syllable.
4503 if (locinput < PL_regeol
4504 && is_GCB_LV_LVT_V_utf8(locinput))
4507 /* Otherwise keep going. Must be LV, LVT or V.
4509 if (is_utf8_X_LVT((U8*)locinput)) {
4510 locinput += UTF8SKIP(locinput);
4513 /* Must be V or LV. Take it, then match
4515 locinput += UTF8SKIP(locinput);
4516 while (locinput < PL_regeol
4517 && (len = is_GCB_V_utf8(locinput)))
4523 /* And any of LV, LVT, or V can be followed
4525 while (locinput < PL_regeol
4526 && (len = is_GCB_T_utf8(locinput)))
4534 /* Match any extender */
4535 while (locinput < PL_regeol
4536 && swash_fetch(PL_utf8_X_extend,
4537 (U8*)locinput, utf8_target))
4539 locinput += UTF8SKIP(locinput);
4543 if (locinput > PL_regeol) sayNO;
4547 case NREFFL: /* /\g{name}/il */
4548 { /* The capture buffer cases. The ones beginning with N for the
4549 named buffers just convert to the equivalent numbered and
4550 pretend they were called as the corresponding numbered buffer
4552 /* don't initialize these in the declaration, it makes C++
4557 const U8 *fold_array;
4560 PL_reg_flags |= RF_tainted;
4561 folder = foldEQ_locale;
4562 fold_array = PL_fold_locale;
4564 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4567 case NREFFA: /* /\g{name}/iaa */
4568 folder = foldEQ_latin1;
4569 fold_array = PL_fold_latin1;
4571 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4574 case NREFFU: /* /\g{name}/iu */
4575 folder = foldEQ_latin1;
4576 fold_array = PL_fold_latin1;
4578 utf8_fold_flags = 0;
4581 case NREFF: /* /\g{name}/i */
4583 fold_array = PL_fold;
4585 utf8_fold_flags = 0;
4588 case NREF: /* /\g{name}/ */
4592 utf8_fold_flags = 0;
4595 /* For the named back references, find the corresponding buffer
4597 n = reg_check_named_buff_matched(rex,scan);
4602 goto do_nref_ref_common;
4604 case REFFL: /* /\1/il */
4605 PL_reg_flags |= RF_tainted;
4606 folder = foldEQ_locale;
4607 fold_array = PL_fold_locale;
4608 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4611 case REFFA: /* /\1/iaa */
4612 folder = foldEQ_latin1;
4613 fold_array = PL_fold_latin1;
4614 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4617 case REFFU: /* /\1/iu */
4618 folder = foldEQ_latin1;
4619 fold_array = PL_fold_latin1;
4620 utf8_fold_flags = 0;
4623 case REFF: /* /\1/i */
4625 fold_array = PL_fold;
4626 utf8_fold_flags = 0;
4629 case REF: /* /\1/ */
4632 utf8_fold_flags = 0;
4636 n = ARG(scan); /* which paren pair */
4639 ln = rex->offs[n].start;
4640 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4641 if (rex->lastparen < n || ln == -1)
4642 sayNO; /* Do not match unless seen CLOSEn. */
4643 if (ln == rex->offs[n].end)
4647 if (type != REF /* REF can do byte comparison */
4648 && (utf8_target || type == REFFU))
4649 { /* XXX handle REFFL better */
4650 char * limit = PL_regeol;
4652 /* This call case insensitively compares the entire buffer
4653 * at s, with the current input starting at locinput, but
4654 * not going off the end given by PL_regeol, and returns in
4655 * <limit> upon success, how much of the current input was
4657 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4658 locinput, &limit, 0, utf8_target, utf8_fold_flags))
4666 /* Not utf8: Inline the first character, for speed. */
4667 if (!NEXTCHR_IS_EOS &&
4668 UCHARAT(s) != nextchr &&
4670 UCHARAT(s) != fold_array[nextchr]))
4672 ln = rex->offs[n].end - ln;
4673 if (locinput + ln > PL_regeol)
4675 if (ln > 1 && (type == REF
4676 ? memNE(s, locinput, ln)
4677 : ! folder(s, locinput, ln)))
4683 case NOTHING: /* null op; e.g. the 'nothing' following
4684 * the '*' in m{(a+|b)*}' */
4686 case TAIL: /* placeholder while compiling (A|B|C) */
4689 case BACK: /* ??? doesn't appear to be used ??? */
4693 #define ST st->u.eval
4698 regexp_internal *rei;
4699 regnode *startpoint;
4701 case GOSTART: /* (?R) */
4702 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4703 if (cur_eval && cur_eval->locinput==locinput) {
4704 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4705 Perl_croak(aTHX_ "Infinite recursion in regex");
4706 if ( ++nochange_depth > max_nochange_depth )
4708 "Pattern subroutine nesting without pos change"
4709 " exceeded limit in regex");
4716 if (OP(scan)==GOSUB) {
4717 startpoint = scan + ARG2L(scan);
4718 ST.close_paren = ARG(scan);
4720 startpoint = rei->program+1;
4723 goto eval_recurse_doit;
4724 assert(0); /* NOTREACHED */
4726 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4727 if (cur_eval && cur_eval->locinput==locinput) {
4728 if ( ++nochange_depth > max_nochange_depth )
4729 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4734 /* execute the code in the {...} */
4738 OP * const oop = PL_op;
4739 COP * const ocurcop = PL_curcop;
4741 char *saved_regeol = PL_regeol;
4742 struct re_save_state saved_state;
4745 /* save *all* paren positions */
4747 REGCP_SET(runops_cp);
4749 /* To not corrupt the existing regex state while executing the
4750 * eval we would normally put it on the save stack, like with
4751 * save_re_context. However, re-evals have a weird scoping so we
4752 * can't just add ENTER/LEAVE here. With that, things like
4754 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4756 * would break, as they expect the localisation to be unwound
4757 * only when the re-engine backtracks through the bit that
4760 * What we do instead is just saving the state in a local c
4763 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4765 PL_reg_state.re_reparsing = FALSE;
4768 caller_cv = find_runcv(NULL);
4772 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4773 newcv = ((struct regexp *)SvANY(
4774 (REGEXP*)(rexi->data->data[n])
4777 nop = (OP*)rexi->data->data[n+1];
4779 else if (rexi->data->what[n] == 'l') { /* literal code */
4781 nop = (OP*)rexi->data->data[n];
4782 assert(CvDEPTH(newcv));
4785 /* literal with own CV */
4786 assert(rexi->data->what[n] == 'L');
4787 newcv = rex->qr_anoncv;
4788 nop = (OP*)rexi->data->data[n];
4791 /* normally if we're about to execute code from the same
4792 * CV that we used previously, we just use the existing
4793 * CX stack entry. However, its possible that in the
4794 * meantime we may have backtracked, popped from the save
4795 * stack, and undone the SAVECOMPPAD(s) associated with
4796 * PUSH_MULTICALL; in which case PL_comppad no longer
4797 * points to newcv's pad. */
4798 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4800 I32 depth = (newcv == caller_cv) ? 0 : 1;
4801 if (last_pushed_cv) {
4802 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4805 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4807 last_pushed_cv = newcv;
4809 last_pad = PL_comppad;
4811 /* the initial nextstate you would normally execute
4812 * at the start of an eval (which would cause error
4813 * messages to come from the eval), may be optimised
4814 * away from the execution path in the regex code blocks;
4815 * so manually set PL_curcop to it initially */
4817 OP *o = cUNOPx(nop)->op_first;
4818 assert(o->op_type == OP_NULL);
4819 if (o->op_targ == OP_SCOPE) {
4820 o = cUNOPo->op_first;
4823 assert(o->op_targ == OP_LEAVE);
4824 o = cUNOPo->op_first;
4825 assert(o->op_type == OP_ENTER);
4829 if (o->op_type != OP_STUB) {
4830 assert( o->op_type == OP_NEXTSTATE
4831 || o->op_type == OP_DBSTATE
4832 || (o->op_type == OP_NULL
4833 && ( o->op_targ == OP_NEXTSTATE
4834 || o->op_targ == OP_DBSTATE
4838 PL_curcop = (COP*)o;
4843 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4844 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4846 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4849 SV *sv_mrk = get_sv("REGMARK", 1);
4850 sv_setsv(sv_mrk, sv_yes_mark);
4853 /* we don't use MULTICALL here as we want to call the
4854 * first op of the block of interest, rather than the
4855 * first op of the sub */
4858 CALLRUNOPS(aTHX); /* Scalar context. */
4861 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
4867 /* before restoring everything, evaluate the returned
4868 * value, so that 'uninit' warnings don't use the wrong
4869 * PL_op or pad. Also need to process any magic vars
4870 * (e.g. $1) *before* parentheses are restored */
4875 if (logical == 0) /* (?{})/ */
4876 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4877 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4878 sw = cBOOL(SvTRUE(ret));
4881 else { /* /(??{}) */
4882 /* if its overloaded, let the regex compiler handle
4883 * it; otherwise extract regex, or stringify */
4884 if (!SvAMAGIC(ret)) {
4888 if (SvTYPE(sv) == SVt_REGEXP)
4889 re_sv = (REGEXP*) sv;
4890 else if (SvSMAGICAL(sv)) {
4891 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4893 re_sv = (REGEXP *) mg->mg_obj;
4896 /* force any magic, undef warnings here */
4898 ret = sv_mortalcopy(ret);
4899 (void) SvPV_force_nolen(ret);
4905 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4907 /* *** Note that at this point we don't restore
4908 * PL_comppad, (or pop the CxSUB) on the assumption it may
4909 * be used again soon. This is safe as long as nothing
4910 * in the regexp code uses the pad ! */
4912 PL_curcop = ocurcop;
4913 PL_regeol = saved_regeol;
4914 S_regcp_restore(aTHX_ rex, runops_cp);
4920 /* only /(??{})/ from now on */
4923 /* extract RE object from returned value; compiling if
4927 re_sv = reg_temp_copy(NULL, re_sv);
4931 const I32 osize = PL_regsize;
4933 if (SvUTF8(ret) && IN_BYTES) {
4934 /* In use 'bytes': make a copy of the octet
4935 * sequence, but without the flag on */
4937 const char *const p = SvPV(ret, len);
4938 ret = newSVpvn_flags(p, len, SVs_TEMP);
4940 if (rex->intflags & PREGf_USE_RE_EVAL)
4941 pm_flags |= PMf_USE_RE_EVAL;
4943 /* if we got here, it should be an engine which
4944 * supports compiling code blocks and stuff */
4945 assert(rex->engine && rex->engine->op_comp);
4946 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4947 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4948 rex->engine, NULL, NULL,
4949 /* copy /msix etc to inner pattern */
4954 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4956 /* This isn't a first class regexp. Instead, it's
4957 caching a regexp onto an existing, Perl visible
4959 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4962 /* safe to do now that any $1 etc has been
4963 * interpolated into the new pattern string and
4965 S_regcp_restore(aTHX_ rex, runops_cp);
4967 re = (struct regexp *)SvANY(re_sv);
4969 RXp_MATCH_COPIED_off(re);
4970 re->subbeg = rex->subbeg;
4971 re->sublen = rex->sublen;
4972 re->suboffset = rex->suboffset;
4973 re->subcoffset = rex->subcoffset;
4976 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4977 "Matching embedded");
4979 startpoint = rei->program + 1;
4980 ST.close_paren = 0; /* only used for GOSUB */
4982 eval_recurse_doit: /* Share code with GOSUB below this line */
4983 /* run the pattern returned from (??{...}) */
4984 ST.cp = regcppush(rex, 0); /* Save *all* the positions. */
4985 REGCP_SET(ST.lastcp);
4988 re->lastcloseparen = 0;
4992 /* XXXX This is too dramatic a measure... */
4995 ST.toggle_reg_flags = PL_reg_flags;
4997 PL_reg_flags |= RF_utf8;
4999 PL_reg_flags &= ~RF_utf8;
5000 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
5002 ST.prev_rex = rex_sv;
5003 ST.prev_curlyx = cur_curlyx;
5005 SET_reg_curpm(rex_sv);
5010 ST.prev_eval = cur_eval;
5012 /* now continue from first node in postoned RE */
5013 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5014 assert(0); /* NOTREACHED */
5017 case EVAL_AB: /* cleanup after a successful (??{A})B */
5018 /* note: this is called twice; first after popping B, then A */
5019 PL_reg_flags ^= ST.toggle_reg_flags;
5020 rex_sv = ST.prev_rex;
5021 SET_reg_curpm(rex_sv);
5022 rex = (struct regexp *)SvANY(rex_sv);
5023 rexi = RXi_GET(rex);
5025 cur_eval = ST.prev_eval;
5026 cur_curlyx = ST.prev_curlyx;
5028 /* XXXX This is too dramatic a measure... */
5030 if ( nochange_depth )
5035 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5036 /* note: this is called twice; first after popping B, then A */
5037 PL_reg_flags ^= ST.toggle_reg_flags;
5038 rex_sv = ST.prev_rex;
5039 SET_reg_curpm(rex_sv);
5040 rex = (struct regexp *)SvANY(rex_sv);
5041 rexi = RXi_GET(rex);
5043 REGCP_UNWIND(ST.lastcp);
5045 cur_eval = ST.prev_eval;
5046 cur_curlyx = ST.prev_curlyx;
5047 /* XXXX This is too dramatic a measure... */
5049 if ( nochange_depth )
5055 n = ARG(scan); /* which paren pair */
5056 rex->offs[n].start_tmp = locinput - PL_bostr;
5059 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5060 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
5064 (IV)rex->offs[n].start_tmp,
5070 /* XXX really need to log other places start/end are set too */
5071 #define CLOSE_CAPTURE \
5072 rex->offs[n].start = rex->offs[n].start_tmp; \
5073 rex->offs[n].end = locinput - PL_bostr; \
5074 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5075 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5077 PTR2UV(rex->offs), \
5079 (IV)rex->offs[n].start, \
5080 (IV)rex->offs[n].end \
5084 n = ARG(scan); /* which paren pair */
5086 /*if (n > PL_regsize)
5088 if (n > rex->lastparen)
5090 rex->lastcloseparen = n;
5091 if (cur_eval && cur_eval->u.eval.close_paren == n) {
5096 case ACCEPT: /* (*ACCEPT) */
5100 cursor && OP(cursor)!=END;
5101 cursor=regnext(cursor))
5103 if ( OP(cursor)==CLOSE ){
5105 if ( n <= lastopen ) {
5107 /*if (n > PL_regsize)
5109 if (n > rex->lastparen)
5111 rex->lastcloseparen = n;
5112 if ( n == ARG(scan) || (cur_eval &&
5113 cur_eval->u.eval.close_paren == n))
5122 case GROUPP: /* (?(1)) */
5123 n = ARG(scan); /* which paren pair */
5124 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5127 case NGROUPP: /* (?(<name>)) */
5128 /* reg_check_named_buff_matched returns 0 for no match */
5129 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5132 case INSUBP: /* (?(R)) */
5134 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5137 case DEFINEP: /* (?(DEFINE)) */
5141 case IFTHEN: /* (?(cond)A|B) */
5142 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
5144 next = NEXTOPER(NEXTOPER(scan));
5146 next = scan + ARG(scan);
5147 if (OP(next) == IFTHEN) /* Fake one. */
5148 next = NEXTOPER(NEXTOPER(next));
5152 case LOGICAL: /* modifier for EVAL and IFMATCH */
5153 logical = scan->flags;
5156 /*******************************************************************
5158 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5159 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5160 STAR/PLUS/CURLY/CURLYN are used instead.)
5162 A*B is compiled as <CURLYX><A><WHILEM><B>
5164 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5165 state, which contains the current count, initialised to -1. It also sets
5166 cur_curlyx to point to this state, with any previous value saved in the
5169 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5170 since the pattern may possibly match zero times (i.e. it's a while {} loop
5171 rather than a do {} while loop).
5173 Each entry to WHILEM represents a successful match of A. The count in the
5174 CURLYX block is incremented, another WHILEM state is pushed, and execution
5175 passes to A or B depending on greediness and the current count.
5177 For example, if matching against the string a1a2a3b (where the aN are
5178 substrings that match /A/), then the match progresses as follows: (the
5179 pushed states are interspersed with the bits of strings matched so far):
5182 <CURLYX cnt=0><WHILEM>
5183 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5184 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5185 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5186 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5188 (Contrast this with something like CURLYM, which maintains only a single
5192 a1 <CURLYM cnt=1> a2
5193 a1 a2 <CURLYM cnt=2> a3
5194 a1 a2 a3 <CURLYM cnt=3> b
5197 Each WHILEM state block marks a point to backtrack to upon partial failure
5198 of A or B, and also contains some minor state data related to that
5199 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5200 overall state, such as the count, and pointers to the A and B ops.
5202 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5203 must always point to the *current* CURLYX block, the rules are:
5205 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5206 and set cur_curlyx to point the new block.
5208 When popping the CURLYX block after a successful or unsuccessful match,
5209 restore the previous cur_curlyx.
5211 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5212 to the outer one saved in the CURLYX block.
5214 When popping the WHILEM block after a successful or unsuccessful B match,
5215 restore the previous cur_curlyx.
5217 Here's an example for the pattern (AI* BI)*BO
5218 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5221 curlyx backtrack stack
5222 ------ ---------------
5224 CO <CO prev=NULL> <WO>
5225 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5226 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5227 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5229 At this point the pattern succeeds, and we work back down the stack to
5230 clean up, restoring as we go:
5232 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5233 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5234 CO <CO prev=NULL> <WO>
5237 *******************************************************************/
5239 #define ST st->u.curlyx
5241 case CURLYX: /* start of /A*B/ (for complex A) */
5243 /* No need to save/restore up to this paren */
5244 I32 parenfloor = scan->flags;
5246 assert(next); /* keep Coverity happy */
5247 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5250 /* XXXX Probably it is better to teach regpush to support
5251 parenfloor > PL_regsize... */
5252 if (parenfloor > (I32)rex->lastparen)
5253 parenfloor = rex->lastparen; /* Pessimization... */
5255 ST.prev_curlyx= cur_curlyx;
5257 ST.cp = PL_savestack_ix;
5259 /* these fields contain the state of the current curly.
5260 * they are accessed by subsequent WHILEMs */
5261 ST.parenfloor = parenfloor;
5266 ST.count = -1; /* this will be updated by WHILEM */
5267 ST.lastloc = NULL; /* this will be updated by WHILEM */
5269 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5270 assert(0); /* NOTREACHED */
5273 case CURLYX_end: /* just finished matching all of A*B */
5274 cur_curlyx = ST.prev_curlyx;
5276 assert(0); /* NOTREACHED */
5278 case CURLYX_end_fail: /* just failed to match all of A*B */
5280 cur_curlyx = ST.prev_curlyx;
5282 assert(0); /* NOTREACHED */
5286 #define ST st->u.whilem
5288 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5290 /* see the discussion above about CURLYX/WHILEM */
5292 int min = ARG1(cur_curlyx->u.curlyx.me);
5293 int max = ARG2(cur_curlyx->u.curlyx.me);
5294 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5296 assert(cur_curlyx); /* keep Coverity happy */
5297 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5298 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5299 ST.cache_offset = 0;
5303 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5304 "%*s whilem: matched %ld out of %d..%d\n",
5305 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5308 /* First just match a string of min A's. */
5311 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5312 cur_curlyx->u.curlyx.lastloc = locinput;
5313 REGCP_SET(ST.lastcp);
5315 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5316 assert(0); /* NOTREACHED */
5319 /* If degenerate A matches "", assume A done. */
5321 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5322 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5323 "%*s whilem: empty match detected, trying continuation...\n",
5324 REPORT_CODE_OFF+depth*2, "")
5326 goto do_whilem_B_max;
5329 /* super-linear cache processing */
5333 if (!PL_reg_maxiter) {
5334 /* start the countdown: Postpone detection until we
5335 * know the match is not *that* much linear. */
5336 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5337 /* possible overflow for long strings and many CURLYX's */
5338 if (PL_reg_maxiter < 0)
5339 PL_reg_maxiter = I32_MAX;
5340 PL_reg_leftiter = PL_reg_maxiter;
5343 if (PL_reg_leftiter-- == 0) {
5344 /* initialise cache */
5345 const I32 size = (PL_reg_maxiter + 7)/8;
5346 if (PL_reg_poscache) {
5347 if ((I32)PL_reg_poscache_size < size) {
5348 Renew(PL_reg_poscache, size, char);
5349 PL_reg_poscache_size = size;
5351 Zero(PL_reg_poscache, size, char);
5354 PL_reg_poscache_size = size;
5355 Newxz(PL_reg_poscache, size, char);
5357 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5358 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5359 PL_colors[4], PL_colors[5])
5363 if (PL_reg_leftiter < 0) {
5364 /* have we already failed at this position? */
5366 offset = (scan->flags & 0xf) - 1
5367 + (locinput - PL_bostr) * (scan->flags>>4);
5368 mask = 1 << (offset % 8);
5370 if (PL_reg_poscache[offset] & mask) {
5371 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5372 "%*s whilem: (cache) already tried at this position...\n",
5373 REPORT_CODE_OFF+depth*2, "")
5375 sayNO; /* cache records failure */
5377 ST.cache_offset = offset;
5378 ST.cache_mask = mask;
5382 /* Prefer B over A for minimal matching. */
5384 if (cur_curlyx->u.curlyx.minmod) {
5385 ST.save_curlyx = cur_curlyx;
5386 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5387 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5388 REGCP_SET(ST.lastcp);
5389 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5391 assert(0); /* NOTREACHED */
5394 /* Prefer A over B for maximal matching. */
5396 if (n < max) { /* More greed allowed? */
5397 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5398 cur_curlyx->u.curlyx.lastloc = locinput;
5399 REGCP_SET(ST.lastcp);
5400 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5401 assert(0); /* NOTREACHED */
5403 goto do_whilem_B_max;
5405 assert(0); /* NOTREACHED */
5407 case WHILEM_B_min: /* just matched B in a minimal match */
5408 case WHILEM_B_max: /* just matched B in a maximal match */
5409 cur_curlyx = ST.save_curlyx;
5411 assert(0); /* NOTREACHED */
5413 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5414 cur_curlyx = ST.save_curlyx;
5415 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5416 cur_curlyx->u.curlyx.count--;
5418 assert(0); /* NOTREACHED */
5420 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5422 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5423 REGCP_UNWIND(ST.lastcp);
5425 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5426 cur_curlyx->u.curlyx.count--;
5428 assert(0); /* NOTREACHED */
5430 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5431 REGCP_UNWIND(ST.lastcp);
5432 regcppop(rex); /* Restore some previous $<digit>s? */
5433 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5434 "%*s whilem: failed, trying continuation...\n",
5435 REPORT_CODE_OFF+depth*2, "")
5438 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5439 && ckWARN(WARN_REGEXP)
5440 && !(PL_reg_flags & RF_warned))
5442 PL_reg_flags |= RF_warned;
5443 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5444 "Complex regular subexpression recursion limit (%d) "
5450 ST.save_curlyx = cur_curlyx;
5451 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5452 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5454 assert(0); /* NOTREACHED */
5456 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5457 cur_curlyx = ST.save_curlyx;
5458 REGCP_UNWIND(ST.lastcp);
5461 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5462 /* Maximum greed exceeded */
5463 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5464 && ckWARN(WARN_REGEXP)
5465 && !(PL_reg_flags & RF_warned))
5467 PL_reg_flags |= RF_warned;
5468 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5469 "Complex regular subexpression recursion "
5470 "limit (%d) exceeded",
5473 cur_curlyx->u.curlyx.count--;
5477 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5478 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5480 /* Try grabbing another A and see if it helps. */
5481 cur_curlyx->u.curlyx.lastloc = locinput;
5482 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5483 REGCP_SET(ST.lastcp);
5484 PUSH_STATE_GOTO(WHILEM_A_min,
5485 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5487 assert(0); /* NOTREACHED */
5490 #define ST st->u.branch
5492 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
5493 next = scan + ARG(scan);
5496 scan = NEXTOPER(scan);
5499 case BRANCH: /* /(...|A|...)/ */
5500 scan = NEXTOPER(scan); /* scan now points to inner node */
5501 ST.lastparen = rex->lastparen;
5502 ST.lastcloseparen = rex->lastcloseparen;
5503 ST.next_branch = next;
5506 /* Now go into the branch */
5508 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5510 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5512 assert(0); /* NOTREACHED */
5514 case CUTGROUP: /* /(*THEN)/ */
5515 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5516 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5517 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5518 assert(0); /* NOTREACHED */
5520 case CUTGROUP_next_fail:
5523 if (st->u.mark.mark_name)
5524 sv_commit = st->u.mark.mark_name;
5526 assert(0); /* NOTREACHED */
5530 assert(0); /* NOTREACHED */
5532 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5537 REGCP_UNWIND(ST.cp);
5538 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5539 scan = ST.next_branch;
5540 /* no more branches? */
5541 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5543 PerlIO_printf( Perl_debug_log,
5544 "%*s %sBRANCH failed...%s\n",
5545 REPORT_CODE_OFF+depth*2, "",
5551 continue; /* execute next BRANCH[J] op */
5552 assert(0); /* NOTREACHED */
5554 case MINMOD: /* next op will be non-greedy, e.g. A*? */
5559 #define ST st->u.curlym
5561 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5563 /* This is an optimisation of CURLYX that enables us to push
5564 * only a single backtracking state, no matter how many matches
5565 * there are in {m,n}. It relies on the pattern being constant
5566 * length, with no parens to influence future backrefs
5570 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5572 ST.lastparen = rex->lastparen;
5573 ST.lastcloseparen = rex->lastcloseparen;
5575 /* if paren positive, emulate an OPEN/CLOSE around A */
5577 U32 paren = ST.me->flags;
5578 if (paren > PL_regsize)
5580 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5588 ST.c1 = CHRTEST_UNINIT;
5591 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5594 curlym_do_A: /* execute the A in /A{m,n}B/ */
5595 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5596 assert(0); /* NOTREACHED */
5598 case CURLYM_A: /* we've just matched an A */
5600 /* after first match, determine A's length: u.curlym.alen */
5601 if (ST.count == 1) {
5602 if (PL_reg_match_utf8) {
5603 char *s = st->locinput;
5604 while (s < locinput) {
5610 ST.alen = locinput - st->locinput;
5613 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5616 PerlIO_printf(Perl_debug_log,
5617 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5618 (int)(REPORT_CODE_OFF+(depth*2)), "",
5619 (IV) ST.count, (IV)ST.alen)
5622 if (cur_eval && cur_eval->u.eval.close_paren &&
5623 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5627 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5628 if ( max == REG_INFTY || ST.count < max )
5629 goto curlym_do_A; /* try to match another A */
5631 goto curlym_do_B; /* try to match B */
5633 case CURLYM_A_fail: /* just failed to match an A */
5634 REGCP_UNWIND(ST.cp);
5636 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5637 || (cur_eval && cur_eval->u.eval.close_paren &&
5638 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5641 curlym_do_B: /* execute the B in /A{m,n}B/ */
5642 if (ST.c1 == CHRTEST_UNINIT) {
5643 /* calculate c1 and c2 for possible match of 1st char
5644 * following curly */
5645 ST.c1 = ST.c2 = CHRTEST_VOID;
5646 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5647 regnode *text_node = ST.B;
5648 if (! HAS_TEXT(text_node))
5649 FIND_NEXT_IMPT(text_node);
5652 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5654 But the former is redundant in light of the latter.
5656 if this changes back then the macro for
5657 IS_TEXT and friends need to change.
5659 if (PL_regkind[OP(text_node)] == EXACT) {
5660 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5661 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5670 PerlIO_printf(Perl_debug_log,
5671 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5672 (int)(REPORT_CODE_OFF+(depth*2)),
5675 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
5676 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
5677 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5678 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5680 /* simulate B failing */
5682 PerlIO_printf(Perl_debug_log,
5683 "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n",
5684 (int)(REPORT_CODE_OFF+(depth*2)),"",
5685 valid_utf8_to_uvchr((U8 *) locinput, NULL),
5686 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
5687 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
5689 state_num = CURLYM_B_fail;
5690 goto reenter_switch;
5693 else if (nextchr != ST.c1 && nextchr != ST.c2) {
5694 /* simulate B failing */
5696 PerlIO_printf(Perl_debug_log,
5697 "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n",
5698 (int)(REPORT_CODE_OFF+(depth*2)),"",
5699 (int) nextchr, ST.c1, ST.c2)
5701 state_num = CURLYM_B_fail;
5702 goto reenter_switch;
5707 /* emulate CLOSE: mark current A as captured */
5708 I32 paren = ST.me->flags;
5710 rex->offs[paren].start
5711 = HOPc(locinput, -ST.alen) - PL_bostr;
5712 rex->offs[paren].end = locinput - PL_bostr;
5713 if ((U32)paren > rex->lastparen)
5714 rex->lastparen = paren;
5715 rex->lastcloseparen = paren;
5718 rex->offs[paren].end = -1;
5719 if (cur_eval && cur_eval->u.eval.close_paren &&
5720 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5729 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5730 assert(0); /* NOTREACHED */
5732 case CURLYM_B_fail: /* just failed to match a B */
5733 REGCP_UNWIND(ST.cp);
5734 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5736 I32 max = ARG2(ST.me);
5737 if (max != REG_INFTY && ST.count == max)
5739 goto curlym_do_A; /* try to match a further A */
5741 /* backtrack one A */
5742 if (ST.count == ARG1(ST.me) /* min */)
5745 SET_locinput(HOPc(locinput, -ST.alen));
5746 goto curlym_do_B; /* try to match B */
5749 #define ST st->u.curly
5751 #define CURLY_SETPAREN(paren, success) \
5754 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5755 rex->offs[paren].end = locinput - PL_bostr; \
5756 if (paren > rex->lastparen) \
5757 rex->lastparen = paren; \
5758 rex->lastcloseparen = paren; \
5761 rex->offs[paren].end = -1; \
5762 rex->lastparen = ST.lastparen; \
5763 rex->lastcloseparen = ST.lastcloseparen; \
5767 case STAR: /* /A*B/ where A is width 1 char */
5771 scan = NEXTOPER(scan);
5774 case PLUS: /* /A+B/ where A is width 1 char */
5778 scan = NEXTOPER(scan);
5781 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
5782 ST.paren = scan->flags; /* Which paren to set */
5783 ST.lastparen = rex->lastparen;
5784 ST.lastcloseparen = rex->lastcloseparen;
5785 if (ST.paren > PL_regsize)
5786 PL_regsize = ST.paren;
5787 ST.min = ARG1(scan); /* min to match */
5788 ST.max = ARG2(scan); /* max to match */
5789 if (cur_eval && cur_eval->u.eval.close_paren &&
5790 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5794 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5797 case CURLY: /* /A{m,n}B/ where A is width 1 char */
5799 ST.min = ARG1(scan); /* min to match */
5800 ST.max = ARG2(scan); /* max to match */
5801 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5804 * Lookahead to avoid useless match attempts
5805 * when we know what character comes next.
5807 * Used to only do .*x and .*?x, but now it allows
5808 * for )'s, ('s and (?{ ... })'s to be in the way
5809 * of the quantifier and the EXACT-like node. -- japhy
5812 assert(ST.min <= ST.max);
5813 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
5814 ST.c1 = ST.c2 = CHRTEST_VOID;
5817 regnode *text_node = next;
5819 if (! HAS_TEXT(text_node))
5820 FIND_NEXT_IMPT(text_node);
5822 if (! HAS_TEXT(text_node))
5823 ST.c1 = ST.c2 = CHRTEST_VOID;
5825 if ( PL_regkind[OP(text_node)] != EXACT ) {
5826 ST.c1 = ST.c2 = CHRTEST_VOID;
5830 /* Currently we only get here when
5832 PL_rekind[OP(text_node)] == EXACT
5834 if this changes back then the macro for IS_TEXT and
5835 friends need to change. */
5836 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
5837 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8))
5848 char *li = locinput;
5850 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5855 if (ST.c1 == CHRTEST_VOID)
5856 goto curly_try_B_min;
5858 ST.oldloc = locinput;
5860 /* set ST.maxpos to the furthest point along the
5861 * string that could possibly match */
5862 if (ST.max == REG_INFTY) {
5863 ST.maxpos = PL_regeol - 1;
5865 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5868 else if (utf8_target) {
5869 int m = ST.max - ST.min;
5870 for (ST.maxpos = locinput;
5871 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5872 ST.maxpos += UTF8SKIP(ST.maxpos);
5875 ST.maxpos = locinput + ST.max - ST.min;
5876 if (ST.maxpos >= PL_regeol)
5877 ST.maxpos = PL_regeol - 1;
5879 goto curly_try_B_min_known;
5883 /* avoid taking address of locinput, so it can remain
5885 char *li = locinput;
5886 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5887 if (ST.count < ST.min)
5890 if ((ST.count > ST.min)
5891 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5893 /* A{m,n} must come at the end of the string, there's
5894 * no point in backing off ... */
5896 /* ...except that $ and \Z can match before *and* after
5897 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5898 We may back off by one in this case. */
5899 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5903 goto curly_try_B_max;
5905 assert(0); /* NOTREACHED */
5908 case CURLY_B_min_known_fail:
5909 /* failed to find B in a non-greedy match where c1,c2 valid */
5911 REGCP_UNWIND(ST.cp);
5913 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5915 /* Couldn't or didn't -- move forward. */
5916 ST.oldloc = locinput;
5918 locinput += UTF8SKIP(locinput);
5922 curly_try_B_min_known:
5923 /* find the next place where 'B' could work, then call B */
5927 n = (ST.oldloc == locinput) ? 0 : 1;
5928 if (ST.c1 == ST.c2) {
5929 /* set n to utf8_distance(oldloc, locinput) */
5930 while (locinput <= ST.maxpos
5931 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
5933 locinput += UTF8SKIP(locinput);
5938 /* set n to utf8_distance(oldloc, locinput) */
5939 while (locinput <= ST.maxpos
5940 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
5941 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
5943 locinput += UTF8SKIP(locinput);
5948 else { /* Not utf8_target */
5949 if (ST.c1 == ST.c2) {
5950 while (locinput <= ST.maxpos &&
5951 UCHARAT(locinput) != ST.c1)
5955 while (locinput <= ST.maxpos
5956 && UCHARAT(locinput) != ST.c1
5957 && UCHARAT(locinput) != ST.c2)
5960 n = locinput - ST.oldloc;
5962 if (locinput > ST.maxpos)
5965 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5966 * at b; check that everything between oldloc and
5967 * locinput matches */
5968 char *li = ST.oldloc;
5970 if (regrepeat(rex, &li, ST.A, n, depth) < n)
5972 assert(n == REG_INFTY || locinput == li);
5974 CURLY_SETPAREN(ST.paren, ST.count);
5975 if (cur_eval && cur_eval->u.eval.close_paren &&
5976 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5979 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
5981 assert(0); /* NOTREACHED */
5984 case CURLY_B_min_fail:
5985 /* failed to find B in a non-greedy match where c1,c2 invalid */
5987 REGCP_UNWIND(ST.cp);
5989 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5991 /* failed -- move forward one */
5993 char *li = locinput;
5994 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
6001 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6002 ST.count > 0)) /* count overflow ? */
6005 CURLY_SETPAREN(ST.paren, ST.count);
6006 if (cur_eval && cur_eval->u.eval.close_paren &&
6007 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6010 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6014 assert(0); /* NOTREACHED */
6018 /* a successful greedy match: now try to match B */
6019 if (cur_eval && cur_eval->u.eval.close_paren &&
6020 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6024 bool could_match = locinput < PL_regeol;
6026 /* If it could work, try it. */
6027 if (ST.c1 != CHRTEST_VOID && could_match) {
6028 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6030 could_match = memEQ(locinput,
6035 UTF8SKIP(locinput));
6038 could_match = UCHARAT(locinput) == ST.c1
6039 || UCHARAT(locinput) == ST.c2;
6042 if (ST.c1 == CHRTEST_VOID || could_match) {
6043 CURLY_SETPAREN(ST.paren, ST.count);
6044 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6045 assert(0); /* NOTREACHED */
6050 case CURLY_B_max_fail:
6051 /* failed to find B in a greedy match */
6053 REGCP_UNWIND(ST.cp);
6055 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6058 if (--ST.count < ST.min)
6060 locinput = HOPc(locinput, -1);
6061 goto curly_try_B_max;
6065 case END: /* last op of main pattern */
6068 /* we've just finished A in /(??{A})B/; now continue with B */
6069 st->u.eval.toggle_reg_flags
6070 = cur_eval->u.eval.toggle_reg_flags;
6071 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
6073 st->u.eval.prev_rex = rex_sv; /* inner */
6074 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
6075 rex_sv = cur_eval->u.eval.prev_rex;
6076 SET_reg_curpm(rex_sv);
6077 rex = (struct regexp *)SvANY(rex_sv);
6078 rexi = RXi_GET(rex);
6079 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6081 REGCP_SET(st->u.eval.lastcp);
6083 /* Restore parens of the outer rex without popping the
6085 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
6087 st->u.eval.prev_eval = cur_eval;
6088 cur_eval = cur_eval->u.eval.prev_eval;
6090 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6091 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6092 if ( nochange_depth )
6095 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6096 locinput); /* match B */
6099 if (locinput < reginfo->till) {
6100 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6101 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6103 (long)(locinput - PL_reg_starttry),
6104 (long)(reginfo->till - PL_reg_starttry),
6107 sayNO_SILENT; /* Cannot match: too short. */
6109 sayYES; /* Success! */
6111 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6113 PerlIO_printf(Perl_debug_log,
6114 "%*s %ssubpattern success...%s\n",
6115 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6116 sayYES; /* Success! */
6119 #define ST st->u.ifmatch
6124 case SUSPEND: /* (?>A) */
6126 newstart = locinput;
6129 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6131 goto ifmatch_trivial_fail_test;
6133 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6135 ifmatch_trivial_fail_test:
6137 char * const s = HOPBACKc(locinput, scan->flags);
6142 sw = 1 - cBOOL(ST.wanted);
6146 next = scan + ARG(scan);
6154 newstart = locinput;
6158 ST.logical = logical;
6159 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6161 /* execute body of (?...A) */
6162 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6163 assert(0); /* NOTREACHED */
6166 case IFMATCH_A_fail: /* body of (?...A) failed */
6167 ST.wanted = !ST.wanted;
6170 case IFMATCH_A: /* body of (?...A) succeeded */
6172 sw = cBOOL(ST.wanted);
6174 else if (!ST.wanted)
6177 if (OP(ST.me) != SUSPEND) {
6178 /* restore old position except for (?>...) */
6179 locinput = st->locinput;
6181 scan = ST.me + ARG(ST.me);
6184 continue; /* execute B */
6188 case LONGJMP: /* alternative with many branches compiles to
6189 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6190 next = scan + ARG(scan);
6195 case COMMIT: /* (*COMMIT) */
6196 reginfo->cutpoint = PL_regeol;
6199 case PRUNE: /* (*PRUNE) */
6201 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6202 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6203 assert(0); /* NOTREACHED */
6205 case COMMIT_next_fail:
6209 case OPFAIL: /* (*FAIL) */
6211 assert(0); /* NOTREACHED */
6213 #define ST st->u.mark
6214 case MARKPOINT: /* (*MARK:foo) */
6215 ST.prev_mark = mark_state;
6216 ST.mark_name = sv_commit = sv_yes_mark
6217 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6219 ST.mark_loc = locinput;
6220 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6221 assert(0); /* NOTREACHED */
6223 case MARKPOINT_next:
6224 mark_state = ST.prev_mark;
6226 assert(0); /* NOTREACHED */
6228 case MARKPOINT_next_fail:
6229 if (popmark && sv_eq(ST.mark_name,popmark))
6231 if (ST.mark_loc > startpoint)
6232 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6233 popmark = NULL; /* we found our mark */
6234 sv_commit = ST.mark_name;
6237 PerlIO_printf(Perl_debug_log,
6238 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6239 REPORT_CODE_OFF+depth*2, "",
6240 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6243 mark_state = ST.prev_mark;
6244 sv_yes_mark = mark_state ?
6245 mark_state->u.mark.mark_name : NULL;
6247 assert(0); /* NOTREACHED */
6249 case SKIP: /* (*SKIP) */
6251 /* (*SKIP) : if we fail we cut here*/
6252 ST.mark_name = NULL;
6253 ST.mark_loc = locinput;
6254 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6256 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6257 otherwise do nothing. Meaning we need to scan
6259 regmatch_state *cur = mark_state;
6260 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6263 if ( sv_eq( cur->u.mark.mark_name,
6266 ST.mark_name = find;
6267 PUSH_STATE_GOTO( SKIP_next, next, locinput);
6269 cur = cur->u.mark.prev_mark;
6272 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6275 case SKIP_next_fail:
6277 /* (*CUT:NAME) - Set up to search for the name as we
6278 collapse the stack*/
6279 popmark = ST.mark_name;
6281 /* (*CUT) - No name, we cut here.*/
6282 if (ST.mark_loc > startpoint)
6283 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6284 /* but we set sv_commit to latest mark_name if there
6285 is one so they can test to see how things lead to this
6288 sv_commit=mark_state->u.mark.mark_name;
6292 assert(0); /* NOTREACHED */
6295 case LNBREAK: /* \R */
6296 if ((n=is_LNBREAK_safe(locinput, PL_regeol, utf8_target))) {
6302 #define CASE_CLASS(nAmE) \
6304 if (NEXTCHR_IS_EOS) \
6306 if ((n=is_##nAmE(locinput,utf8_target))) { \
6312 if (NEXTCHR_IS_EOS) \
6314 if ((n=is_##nAmE(locinput,utf8_target))) { \
6317 locinput += UTF8SKIP(locinput); \
6321 CASE_CLASS(VERTWS); /* \v \V */
6322 CASE_CLASS(HORIZWS); /* \h \H */
6326 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6327 PTR2UV(scan), OP(scan));
6328 Perl_croak(aTHX_ "regexp memory corruption");
6330 /* this is a point to jump to in order to increment
6331 * locinput by one character */
6333 assert(!NEXTCHR_IS_EOS);
6335 locinput += PL_utf8skip[nextchr];
6336 /* locinput is allowed to go 1 char off the end, but not 2+ */
6337 if (locinput > PL_regeol)
6346 /* switch break jumps here */
6347 scan = next; /* prepare to execute the next op and ... */
6348 continue; /* ... jump back to the top, reusing st */
6349 assert(0); /* NOTREACHED */
6352 /* push a state that backtracks on success */
6353 st->u.yes.prev_yes_state = yes_state;
6357 /* push a new regex state, then continue at scan */
6359 regmatch_state *newst;
6362 regmatch_state *cur = st;
6363 regmatch_state *curyes = yes_state;
6365 regmatch_slab *slab = PL_regmatch_slab;
6366 for (;curd > -1;cur--,curd--) {
6367 if (cur < SLAB_FIRST(slab)) {
6369 cur = SLAB_LAST(slab);
6371 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6372 REPORT_CODE_OFF + 2 + depth * 2,"",
6373 curd, PL_reg_name[cur->resume_state],
6374 (curyes == cur) ? "yes" : ""
6377 curyes = cur->u.yes.prev_yes_state;
6380 DEBUG_STATE_pp("push")
6383 st->locinput = locinput;
6385 if (newst > SLAB_LAST(PL_regmatch_slab))
6386 newst = S_push_slab(aTHX);
6387 PL_regmatch_state = newst;
6389 locinput = pushinput;
6392 assert(0); /* NOTREACHED */
6397 * We get here only if there's trouble -- normally "case END" is
6398 * the terminating point.
6400 Perl_croak(aTHX_ "corrupted regexp pointers");
6406 /* we have successfully completed a subexpression, but we must now
6407 * pop to the state marked by yes_state and continue from there */
6408 assert(st != yes_state);
6410 while (st != yes_state) {
6412 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6413 PL_regmatch_slab = PL_regmatch_slab->prev;
6414 st = SLAB_LAST(PL_regmatch_slab);
6418 DEBUG_STATE_pp("pop (no final)");
6420 DEBUG_STATE_pp("pop (yes)");
6426 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6427 || yes_state > SLAB_LAST(PL_regmatch_slab))
6429 /* not in this slab, pop slab */
6430 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6431 PL_regmatch_slab = PL_regmatch_slab->prev;
6432 st = SLAB_LAST(PL_regmatch_slab);
6434 depth -= (st - yes_state);
6437 yes_state = st->u.yes.prev_yes_state;
6438 PL_regmatch_state = st;
6441 locinput= st->locinput;
6442 state_num = st->resume_state + no_final;
6443 goto reenter_switch;
6446 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6447 PL_colors[4], PL_colors[5]));
6449 if (PL_reg_state.re_state_eval_setup_done) {
6450 /* each successfully executed (?{...}) block does the equivalent of
6451 * local $^R = do {...}
6452 * When popping the save stack, all these locals would be undone;
6453 * bypass this by setting the outermost saved $^R to the latest
6455 if (oreplsv != GvSV(PL_replgv))
6456 sv_setsv(oreplsv, GvSV(PL_replgv));
6463 PerlIO_printf(Perl_debug_log,
6464 "%*s %sfailed...%s\n",
6465 REPORT_CODE_OFF+depth*2, "",
6466 PL_colors[4], PL_colors[5])
6478 /* there's a previous state to backtrack to */
6480 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6481 PL_regmatch_slab = PL_regmatch_slab->prev;
6482 st = SLAB_LAST(PL_regmatch_slab);
6484 PL_regmatch_state = st;
6485 locinput= st->locinput;
6487 DEBUG_STATE_pp("pop");
6489 if (yes_state == st)
6490 yes_state = st->u.yes.prev_yes_state;
6492 state_num = st->resume_state + 1; /* failure = success + 1 */
6493 goto reenter_switch;
6498 if (rex->intflags & PREGf_VERBARG_SEEN) {
6499 SV *sv_err = get_sv("REGERROR", 1);
6500 SV *sv_mrk = get_sv("REGMARK", 1);
6502 sv_commit = &PL_sv_no;
6504 sv_yes_mark = &PL_sv_yes;
6507 sv_commit = &PL_sv_yes;
6508 sv_yes_mark = &PL_sv_no;
6510 sv_setsv(sv_err, sv_commit);
6511 sv_setsv(sv_mrk, sv_yes_mark);
6515 if (last_pushed_cv) {
6518 PERL_UNUSED_VAR(SP);
6521 /* clean up; in particular, free all slabs above current one */
6522 LEAVE_SCOPE(oldsave);
6524 assert(!result || locinput - PL_bostr >= 0);
6525 return result ? locinput - PL_bostr : -1;
6529 - regrepeat - repeatedly match something simple, report how many
6531 * startposp - pointer a pointer to the start position. This is updated
6532 * to point to the byte following the highest successful
6534 * p - the regnode to be repeatedly matched against.
6535 * max - maximum number of characters to match.
6536 * depth - (for debugging) backtracking depth.
6539 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6544 char *loceol = PL_regeol;
6546 bool utf8_target = PL_reg_match_utf8;
6549 PERL_UNUSED_ARG(depth);
6552 PERL_ARGS_ASSERT_REGREPEAT;
6555 if (max == REG_INFTY)
6557 else if (max < loceol - scan)
6558 loceol = scan + max;
6563 while (scan < loceol && hardcount < max && *scan != '\n') {
6564 scan += UTF8SKIP(scan);
6568 while (scan < loceol && *scan != '\n')
6575 while (scan < loceol && hardcount < max) {
6576 scan += UTF8SKIP(scan);
6587 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6591 /* Can use a simple loop if the pattern char to match on is invariant
6592 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
6593 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
6594 * true iff it doesn't matter if the argument is in UTF-8 or not */
6595 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
6596 while (scan < loceol && UCHARAT(scan) == c) {
6600 else if (UTF_PATTERN) {
6602 STRLEN scan_char_len;
6605 /* When both target and pattern are UTF-8, we have to do s
6607 while (hardcount < max
6608 && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
6609 && scan_char_len <= STR_LEN(p)
6610 && memEQ(scan, STRING(p), scan_char_len))
6612 scan += scan_char_len;
6616 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
6618 /* Target isn't utf8; convert the character in the UTF-8
6619 * pattern to non-UTF8, and do a simple loop */
6620 c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
6621 while (scan < loceol && UCHARAT(scan) == c) {
6624 } /* else pattern char is above Latin1, can't possibly match the
6629 /* Here, the string must be utf8; pattern isn't, and <c> is
6630 * different in utf8 than not, so can't compare them directly.
6631 * Outside the loop, find the two utf8 bytes that represent c, and
6632 * then look for those in sequence in the utf8 string */
6633 U8 high = UTF8_TWO_BYTE_HI(c);
6634 U8 low = UTF8_TWO_BYTE_LO(c);
6637 while (hardcount < max
6638 && scan + 1 < loceol
6639 && UCHARAT(scan) == high
6640 && UCHARAT(scan + 1) == low)
6649 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6653 PL_reg_flags |= RF_tainted;
6654 utf8_flags = FOLDEQ_UTF8_LOCALE;
6662 case EXACTFU_TRICKYFOLD:
6664 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6668 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
6670 assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1);
6672 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) {
6673 if (c1 == CHRTEST_VOID) {
6674 /* Use full Unicode fold matching */
6675 char *tmpeol = loceol;
6676 STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1;
6677 while (hardcount < max
6678 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6679 STRING(p), NULL, pat_len,
6680 cBOOL(UTF_PATTERN), utf8_flags))
6687 else if (utf8_target) {
6689 while (hardcount < max
6690 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
6692 scan += UTF8SKIP(scan);
6697 while (hardcount < max
6698 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
6699 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
6701 scan += UTF8SKIP(scan);
6706 else if (c1 == c2) {
6707 while (scan < loceol && UCHARAT(scan) == c1) {
6712 while (scan < loceol &&
6713 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
6725 inclasslen = loceol - scan;
6726 while (hardcount < max
6727 && ((inclasslen = loceol - scan) > 0)
6728 && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6734 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6742 LOAD_UTF8_CHARCLASS_ALNUM();
6743 while (hardcount < max && scan < loceol &&
6744 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6746 scan += UTF8SKIP(scan);
6750 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6758 while (scan < loceol && isALNUM((U8) *scan)) {
6763 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6768 PL_reg_flags |= RF_tainted;
6771 while (hardcount < max && scan < loceol &&
6772 isALNUM_LC_utf8((U8*)scan)) {
6773 scan += UTF8SKIP(scan);
6777 while (scan < loceol && isALNUM_LC(*scan))
6787 LOAD_UTF8_CHARCLASS_ALNUM();
6788 while (hardcount < max && scan < loceol &&
6789 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6791 scan += UTF8SKIP(scan);
6795 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6802 goto utf8_Nwordchar;
6803 while (scan < loceol && ! isALNUM((U8) *scan)) {
6809 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6815 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6816 scan += UTF8SKIP(scan);
6820 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6827 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6828 scan += UTF8SKIP(scan);
6832 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6838 PL_reg_flags |= RF_tainted;
6841 while (hardcount < max && scan < loceol &&
6842 !isALNUM_LC_utf8((U8*)scan)) {
6843 scan += UTF8SKIP(scan);
6847 while (scan < loceol && !isALNUM_LC(*scan))
6857 LOAD_UTF8_CHARCLASS_SPACE();
6858 while (hardcount < max && scan < loceol &&
6860 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6862 scan += UTF8SKIP(scan);
6868 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6877 while (scan < loceol && isSPACE((U8) *scan)) {
6882 while (scan < loceol && isSPACE_A((U8) *scan)) {
6887 PL_reg_flags |= RF_tainted;
6890 while (hardcount < max && scan < loceol &&
6891 isSPACE_LC_utf8((U8*)scan)) {
6892 scan += UTF8SKIP(scan);
6896 while (scan < loceol && isSPACE_LC(*scan))
6906 LOAD_UTF8_CHARCLASS_SPACE();
6907 while (hardcount < max && scan < loceol &&
6909 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6911 scan += UTF8SKIP(scan);
6917 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6926 while (scan < loceol && ! isSPACE((U8) *scan)) {
6932 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6933 scan += UTF8SKIP(scan);
6937 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6943 PL_reg_flags |= RF_tainted;
6946 while (hardcount < max && scan < loceol &&
6947 !isSPACE_LC_utf8((U8*)scan)) {
6948 scan += UTF8SKIP(scan);
6952 while (scan < loceol && !isSPACE_LC(*scan))
6959 LOAD_UTF8_CHARCLASS_DIGIT();
6960 while (hardcount < max && scan < loceol &&
6961 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6962 scan += UTF8SKIP(scan);
6966 while (scan < loceol && isDIGIT(*scan))
6971 while (scan < loceol && isDIGIT_A((U8) *scan)) {
6976 PL_reg_flags |= RF_tainted;
6979 while (hardcount < max && scan < loceol &&
6980 isDIGIT_LC_utf8((U8*)scan)) {
6981 scan += UTF8SKIP(scan);
6985 while (scan < loceol && isDIGIT_LC(*scan))
6992 LOAD_UTF8_CHARCLASS_DIGIT();
6993 while (hardcount < max && scan < loceol &&
6994 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6995 scan += UTF8SKIP(scan);
6999 while (scan < loceol && !isDIGIT(*scan))
7005 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7006 scan += UTF8SKIP(scan);
7010 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
7016 PL_reg_flags |= RF_tainted;
7019 while (hardcount < max && scan < loceol &&
7020 !isDIGIT_LC_utf8((U8*)scan)) {
7021 scan += UTF8SKIP(scan);
7025 while (scan < loceol && !isDIGIT_LC(*scan))
7030 Perl_croak(aTHX_ "panic: regrepeat() should not be called with non-simple: LNBREAK");
7031 assert(0); /* NOTREACHED */
7035 while (hardcount < max && scan < loceol &&
7036 (c=is_HORIZWS_utf8_safe(scan, loceol)))
7042 while (scan < loceol && is_HORIZWS_latin1_safe(scan, loceol))
7049 while (hardcount < max && scan < loceol &&
7050 !is_HORIZWS_utf8_safe(scan, loceol))
7052 scan += UTF8SKIP(scan);
7056 while (scan < loceol && !is_HORIZWS_latin1_safe(scan, loceol))
7064 while (hardcount < max && scan < loceol &&
7065 (c=is_VERTWS_utf8_safe(scan, loceol)))
7071 while (scan < loceol && is_VERTWS_latin1_safe(scan, loceol))
7079 while (hardcount < max && scan < loceol &&
7080 !is_VERTWS_utf8_safe(scan, loceol))
7082 scan += UTF8SKIP(scan);
7086 while (scan < loceol && !is_VERTWS_latin1_safe(scan, loceol))
7092 default: /* Called on something of 0 width. */
7093 break; /* So match right here or not at all. */
7099 c = scan - *startposp;
7103 GET_RE_DEBUG_FLAGS_DECL;
7105 SV * const prop = sv_newmortal();
7106 regprop(prog, prop, p);
7107 PerlIO_printf(Perl_debug_log,
7108 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7109 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7117 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7119 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
7120 create a copy so that changes the caller makes won't change the shared one.
7121 If <altsvp> is non-null, will return NULL in it, for back-compat.
7124 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7126 PERL_ARGS_ASSERT_REGCLASS_SWASH;
7132 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp));
7137 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp)
7139 /* Returns the swash for the input 'node' in the regex 'prog'.
7140 * If <doinit> is true, will attempt to create the swash if not already
7142 * If <listsvp> is non-null, will return the swash initialization string in
7144 * Tied intimately to how regcomp.c sets up the data structure */
7151 RXi_GET_DECL(prog,progi);
7152 const struct reg_data * const data = prog ? progi->data : NULL;
7154 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
7156 assert(ANYOF_NONBITMAP(node));
7158 if (data && data->count) {
7159 const U32 n = ARG(node);
7161 if (data->what[n] == 's') {
7162 SV * const rv = MUTABLE_SV(data->data[n]);
7163 AV * const av = MUTABLE_AV(SvRV(rv));
7164 SV **const ary = AvARRAY(av);
7165 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7167 si = *ary; /* ary[0] = the string to initialize the swash with */
7169 /* Elements 2 and 3 are either both present or both absent. [2] is
7170 * any inversion list generated at compile time; [3] indicates if
7171 * that inversion list has any user-defined properties in it. */
7172 if (av_len(av) >= 2) {
7175 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7182 /* Element [1] is reserved for the set-up swash. If already there,
7183 * return it; if not, create it and store it there */
7184 if (SvROK(ary[1])) {
7187 else if (si && doinit) {
7189 sw = _core_swash_init("utf8", /* the utf8 package */
7193 0, /* not from tr/// */
7196 (void)av_store(av, 1, sw);
7202 SV* matches_string = newSVpvn("", 0);
7204 /* Use the swash, if any, which has to have incorporated into it all
7206 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7207 && (si && si != &PL_sv_undef))
7210 /* If no swash, use the input initialization string, if available */
7211 sv_catsv(matches_string, si);
7214 /* Add the inversion list to whatever we have. This may have come from
7215 * the swash, or from an input parameter */
7217 sv_catsv(matches_string, _invlist_contents(invlist));
7219 *listsvp = matches_string;
7226 - reginclass - determine if a character falls into a character class
7228 n is the ANYOF regnode
7229 p is the target string
7230 lenp is pointer to the maximum number of bytes of how far to go in p
7231 (This is assumed wthout checking to always be at least the current
7233 utf8_target tells whether p is in UTF-8.
7235 Returns true if matched; false otherwise. If lenp is not NULL, on return
7236 from a successful match, the value it points to will be updated to how many
7237 bytes in p were matched. If there was no match, the value is undefined,
7238 possibly changed from the input.
7240 Note that this can be a synthetic start class, a combination of various
7241 nodes, so things you think might be mutually exclusive, such as locale,
7242 aren't. It can match both locale and non-locale
7247 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
7250 const char flags = ANYOF_FLAGS(n);
7256 PERL_ARGS_ASSERT_REGINCLASS;
7258 /* If c is not already the code point, get it */
7259 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
7260 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
7261 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7262 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7263 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7264 * UTF8_ALLOW_FFFF */
7265 if (c_len == (STRLEN)-1)
7266 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7272 /* Use passed in max length, or one character if none passed in or less
7273 * than one character. And assume will match just one character. This is
7274 * overwritten later if matched more. */
7276 maxlen = (*lenp > c_len) ? *lenp : c_len;
7284 /* If this character is potentially in the bitmap, check it */
7286 if (ANYOF_BITMAP_TEST(n, c))
7288 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
7294 else if (flags & ANYOF_LOCALE) {
7295 PL_reg_flags |= RF_tainted;
7297 if ((flags & ANYOF_LOC_FOLD)
7298 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
7302 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
7303 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
7304 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
7305 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
7306 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
7307 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
7308 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
7309 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
7310 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
7311 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
7312 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
7313 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) ||
7314 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) ||
7315 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
7316 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
7317 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
7318 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
7319 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
7320 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
7321 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
7322 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
7323 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
7324 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
7325 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
7326 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
7327 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
7328 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
7329 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
7330 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
7331 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) ||
7332 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c))
7333 ) /* How's that for a conditional? */
7340 /* If the bitmap didn't (or couldn't) match, and something outside the
7341 * bitmap could match, try that. Locale nodes specify completely the
7342 * behavior of code points in the bit map (otherwise, a utf8 target would
7343 * cause them to be treated as Unicode and not locale), except in
7344 * the very unlikely event when this node is a synthetic start class, which
7345 * could be a combination of locale and non-locale nodes. So allow locale
7346 * to match for the synthetic start class, which will give a false
7347 * positive that will be resolved when the match is done again as not part
7348 * of the synthetic start class */
7350 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
7351 match = TRUE; /* Everything above 255 matches */
7353 else if (ANYOF_NONBITMAP(n)
7354 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
7357 || (! (flags & ANYOF_LOCALE))
7358 || (flags & ANYOF_IS_SYNTHETIC)))))
7360 SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
7365 } else { /* Convert to utf8 */
7367 utf8_p = bytes_to_utf8(p, &len);
7370 if (swash_fetch(sw, utf8_p, TRUE)) {
7374 /* If we allocated a string above, free it */
7375 if (! utf8_target) Safefree(utf8_p);
7379 if (UNICODE_IS_SUPER(c)
7380 && (flags & ANYOF_WARN_SUPER)
7381 && ckWARN_d(WARN_NON_UNICODE))
7383 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7384 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7388 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7389 return cBOOL(flags & ANYOF_INVERT) ^ match;
7393 S_reghop3(U8 *s, I32 off, const U8* lim)
7395 /* return the position 'off' UTF-8 characters away from 's', forward if
7396 * 'off' >= 0, backwards if negative. But don't go outside of position
7397 * 'lim', which better be < s if off < 0 */
7401 PERL_ARGS_ASSERT_REGHOP3;
7404 while (off-- && s < lim) {
7405 /* XXX could check well-formedness here */
7410 while (off++ && s > lim) {
7412 if (UTF8_IS_CONTINUED(*s)) {
7413 while (s > lim && UTF8_IS_CONTINUATION(*s))
7416 /* XXX could check well-formedness here */
7423 /* there are a bunch of places where we use two reghop3's that should
7424 be replaced with this routine. but since thats not done yet
7425 we ifdef it out - dmq
7428 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7432 PERL_ARGS_ASSERT_REGHOP4;
7435 while (off-- && s < rlim) {
7436 /* XXX could check well-formedness here */
7441 while (off++ && s > llim) {
7443 if (UTF8_IS_CONTINUED(*s)) {
7444 while (s > llim && UTF8_IS_CONTINUATION(*s))
7447 /* XXX could check well-formedness here */
7455 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7459 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7462 while (off-- && s < lim) {
7463 /* XXX could check well-formedness here */
7470 while (off++ && s > lim) {
7472 if (UTF8_IS_CONTINUED(*s)) {
7473 while (s > lim && UTF8_IS_CONTINUATION(*s))
7476 /* XXX could check well-formedness here */
7485 restore_pos(pTHX_ void *arg)
7488 regexp * const rex = (regexp *)arg;
7489 if (PL_reg_state.re_state_eval_setup_done) {
7490 if (PL_reg_oldsaved) {
7491 rex->subbeg = PL_reg_oldsaved;
7492 rex->sublen = PL_reg_oldsavedlen;
7493 rex->suboffset = PL_reg_oldsavedoffset;
7494 rex->subcoffset = PL_reg_oldsavedcoffset;
7495 #ifdef PERL_OLD_COPY_ON_WRITE
7496 rex->saved_copy = PL_nrs;
7498 RXp_MATCH_COPIED_on(rex);
7500 PL_reg_magic->mg_len = PL_reg_oldpos;
7501 PL_reg_state.re_state_eval_setup_done = FALSE;
7502 PL_curpm = PL_reg_oldcurpm;
7507 S_to_utf8_substr(pTHX_ register regexp *prog)
7509 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
7510 * on the converted value */
7514 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7517 if (prog->substrs->data[i].substr
7518 && !prog->substrs->data[i].utf8_substr) {
7519 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7520 prog->substrs->data[i].utf8_substr = sv;
7521 sv_utf8_upgrade(sv);
7522 if (SvVALID(prog->substrs->data[i].substr)) {
7523 if (SvTAIL(prog->substrs->data[i].substr)) {
7524 /* Trim the trailing \n that fbm_compile added last
7526 SvCUR_set(sv, SvCUR(sv) - 1);
7527 /* Whilst this makes the SV technically "invalid" (as its
7528 buffer is no longer followed by "\0") when fbm_compile()
7529 adds the "\n" back, a "\0" is restored. */
7530 fbm_compile(sv, FBMcf_TAIL);
7534 if (prog->substrs->data[i].substr == prog->check_substr)
7535 prog->check_utf8 = sv;
7541 S_to_byte_substr(pTHX_ register regexp *prog)
7543 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
7544 * on the converted value; returns FALSE if can't be converted. */
7549 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7552 if (prog->substrs->data[i].utf8_substr
7553 && !prog->substrs->data[i].substr) {
7554 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7555 if (! sv_utf8_downgrade(sv, TRUE)) {
7558 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7559 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7560 /* Trim the trailing \n that fbm_compile added last
7562 SvCUR_set(sv, SvCUR(sv) - 1);
7563 fbm_compile(sv, FBMcf_TAIL);
7567 prog->substrs->data[i].substr = sv;
7568 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7569 prog->check_substr = sv;
7576 /* These constants are for finding GCB=LV and GCB=LVT. These are for the
7577 * pre-composed Hangul syllables, which are all in a contiguous block and
7578 * arranged there in such a way so as to facilitate alorithmic determination of
7579 * their characteristics. As such, they don't need a swash, but can be
7580 * determined by simple arithmetic. Almost all are GCB=LVT, but every 28th one
7582 #define SBASE 0xAC00 /* Start of block */
7583 #define SCount 11172 /* Length of block */
7586 #if 0 /* This routine is not currently used */
7587 PERL_STATIC_INLINE bool
7588 S_is_utf8_X_LV(pTHX_ const U8 *p)
7590 /* Unlike most other similarly named routines here, this does not create a
7591 * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7595 UV cp = valid_utf8_to_uvchr(p, NULL);
7597 PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7599 /* The earliest Unicode releases did not have these precomposed Hangul
7600 * syllables. Set to point to undef in that case, so will return false on
7602 if (! PL_utf8_X_LV) { /* Set up if this is the first time called */
7603 PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7604 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7605 SvREFCNT_dec(PL_utf8_X_LV);
7606 PL_utf8_X_LV = &PL_sv_undef;
7610 return (PL_utf8_X_LV != &PL_sv_undef
7611 && cp >= SBASE && cp < SBASE + SCount
7612 && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7616 PERL_STATIC_INLINE bool
7617 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7619 /* Unlike most other similarly named routines here, this does not create a
7620 * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7624 UV cp = valid_utf8_to_uvchr(p, NULL);
7626 PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7628 /* The earliest Unicode releases did not have these precomposed Hangul
7629 * syllables. Set to point to undef in that case, so will return false on
7631 if (! PL_utf8_X_LVT) { /* Set up if this is the first time called */
7632 PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7633 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7634 SvREFCNT_dec(PL_utf8_X_LVT);
7635 PL_utf8_X_LVT = &PL_sv_undef;
7639 return (PL_utf8_X_LVT != &PL_sv_undef
7640 && cp >= SBASE && cp < SBASE + SCount
7641 && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7646 * c-indentation-style: bsd
7648 * indent-tabs-mode: nil
7651 * ex: set ts=8 sts=4 sw=4 et: