5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
78 #ifdef PERL_IN_XSUB_RE
84 #include "inline_invlist.c"
85 #include "unicode_constants.h"
87 #define RF_tainted 1 /* tainted information used? e.g. locale */
88 #define RF_warned 2 /* warned about big count? */
90 #define RF_utf8 8 /* Pattern contains multibyte chars? */
92 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
98 /* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass
99 * call if there are no complications: i.e., if everything matchable is
100 * straight forward in the bitmap */
101 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
102 : ANYOF_BITMAP_TEST(p,*(c)))
108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
111 #define HOPc(pos,off) \
112 (char *)(PL_reg_match_utf8 \
113 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
115 #define HOPBACKc(pos, off) \
116 (char*)(PL_reg_match_utf8\
117 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
118 : (pos - off >= PL_bostr) \
122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
125 /* these are unrolled below in the CCC_TRY_XXX defined */
126 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
127 if (!CAT2(PL_utf8_,class)) { \
129 ENTER; save_re_context(); \
130 ok=CAT2(is_utf8_,class)((const U8*)str); \
131 PERL_UNUSED_VAR(ok); \
132 assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END
133 /* Doesn't do an assert to verify that is correct */
134 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
135 if (!CAT2(PL_utf8_,class)) { \
136 bool throw_away PERL_UNUSED_DECL; \
137 ENTER; save_re_context(); \
138 throw_away = CAT2(is_utf8_,class)((const U8*)" "); \
141 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
142 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
143 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
145 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
146 /* No asserts are done for some of these, in case called on a */ \
147 /* Unicode version in which they map to nothing */ \
148 LOAD_UTF8_CHARCLASS(X_regular_begin, HYPHEN_UTF8); \
149 LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \
151 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
153 /* The actual code for CCC_TRY, which uses several variables from the routine
154 * it's callable from. It is designed to be the bulk of a case statement.
155 * FUNC is the macro or function to call on non-utf8 targets that indicate if
156 * nextchr matches the class.
157 * UTF8_TEST is the whole test string to use for utf8 targets
158 * LOAD is what to use to test, and if not present to load in the swash for the
160 * POS_OR_NEG is either empty or ! to complement the results of FUNC or
162 * The logic is: Fail if we're at the end-of-string; otherwise if the target is
163 * utf8 and a variant, load the swash if necessary and test using the utf8
164 * test. Advance to the next character if test is ok, otherwise fail; If not
165 * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it
166 * fails, or advance to the next character */
168 #define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \
169 if (locinput >= PL_regeol) { \
172 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
173 LOAD_UTF8_CHARCLASS(CLASS, STR); \
174 if (POS_OR_NEG (UTF8_TEST)) { \
177 locinput += PL_utf8skip[nextchr]; \
178 nextchr = UCHARAT(locinput); \
181 if (POS_OR_NEG (FUNC(nextchr))) { \
184 nextchr = UCHARAT(++locinput); \
187 /* Handle the non-locale cases for a character class and its complement. It
188 * calls _CCC_TRY_CODE with a ! to complement the test for the character class.
189 * This is because that code fails when the test succeeds, so we want to have
190 * the test fail so that the code succeeds. The swash is stored in a
191 * predictable PL_ place */
192 #define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \
195 _CCC_TRY_CODE( !, FUNC, \
196 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
197 (U8*)locinput, TRUE)), \
200 _CCC_TRY_CODE( PLACEHOLDER , FUNC, \
201 cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \
202 (U8*)locinput, TRUE)), \
205 /* Generate the case statements for both locale and non-locale character
206 * classes in regmatch for classes that don't have special unicode semantics.
207 * Locales don't use an immediate swash, but an intermediary special locale
208 * function that is called on the pointer to the current place in the input
209 * string. That function will resolve to needing the same swash. One might
210 * think that because we don't know what the locale will match, we shouldn't
211 * check with the swash loading function that it loaded properly; ie, that we
212 * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the
213 * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is
215 #define CCC_TRY(NAME, NNAME, FUNC, \
216 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
217 NAMEA, NNAMEA, FUNCA, \
220 PL_reg_flags |= RF_tainted; \
221 _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \
223 PL_reg_flags |= RF_tainted; \
224 _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \
227 if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \
230 /* Matched a utf8-invariant, so don't have to worry about utf8 */ \
231 nextchr = UCHARAT(++locinput); \
234 if (locinput >= PL_regeol || FUNCA(nextchr)) { \
238 locinput += PL_utf8skip[nextchr]; \
239 nextchr = UCHARAT(locinput); \
242 nextchr = UCHARAT(++locinput); \
245 /* Generate the non-locale cases */ \
246 _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR)
248 /* This is like CCC_TRY, but has an extra set of parameters for generating case
249 * statements to handle separate Unicode semantics nodes */
250 #define CCC_TRY_U(NAME, NNAME, FUNC, \
251 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
252 NAMEU, NNAMEU, FUNCU, \
253 NAMEA, NNAMEA, FUNCA, \
255 CCC_TRY(NAME, NNAME, FUNC, \
256 NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \
257 NAMEA, NNAMEA, FUNCA, \
259 _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR)
261 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
263 /* for use after a quantifier and before an EXACT-like node -- japhy */
264 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
266 * NOTE that *nothing* that affects backtracking should be in here, specifically
267 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
268 * node that is in between two EXACT like nodes when ascertaining what the required
269 * "follow" character is. This should probably be moved to regex compile time
270 * although it may be done at run time beause of the REF possibility - more
271 * investigation required. -- demerphq
273 #define JUMPABLE(rn) ( \
275 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
277 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
278 OP(rn) == PLUS || OP(rn) == MINMOD || \
280 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
282 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
284 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
287 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
288 we don't need this definition. */
289 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
290 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
291 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
294 /* ... so we use this as its faster. */
295 #define IS_TEXT(rn) ( OP(rn)==EXACT )
296 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
297 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
298 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
303 Search for mandatory following text node; for lookahead, the text must
304 follow but for lookbehind (rn->flags != 0) we skip to the next step.
306 #define FIND_NEXT_IMPT(rn) STMT_START { \
307 while (JUMPABLE(rn)) { \
308 const OPCODE type = OP(rn); \
309 if (type == SUSPEND || PL_regkind[type] == CURLY) \
310 rn = NEXTOPER(NEXTOPER(rn)); \
311 else if (type == PLUS) \
313 else if (type == IFMATCH) \
314 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
315 else rn += NEXT_OFF(rn); \
320 static void restore_pos(pTHX_ void *arg);
322 #define REGCP_PAREN_ELEMS 3
323 #define REGCP_OTHER_ELEMS 3
324 #define REGCP_FRAME_ELEMS 1
325 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
326 * are needed for the regexp context stack bookkeeping. */
329 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor)
332 const int retval = PL_savestack_ix;
333 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
334 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
335 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
337 GET_RE_DEBUG_FLAGS_DECL;
339 PERL_ARGS_ASSERT_REGCPPUSH;
341 if (paren_elems_to_push < 0)
342 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0",
343 paren_elems_to_push);
345 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
346 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
347 " out of range (%lu-%ld)",
348 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
350 SSGROW(total_elems + REGCP_FRAME_ELEMS);
353 if ((int)PL_regsize > (int)parenfloor)
354 PerlIO_printf(Perl_debug_log,
355 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
360 for (p = parenfloor+1; p <= (I32)PL_regsize; p++) {
361 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
362 SSPUSHINT(rex->offs[p].end);
363 SSPUSHINT(rex->offs[p].start);
364 SSPUSHINT(rex->offs[p].start_tmp);
365 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
366 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
368 (IV)rex->offs[p].start,
369 (IV)rex->offs[p].start_tmp,
373 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
374 SSPUSHINT(PL_regsize);
375 SSPUSHINT(rex->lastparen);
376 SSPUSHINT(rex->lastcloseparen);
377 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
382 /* These are needed since we do not localize EVAL nodes: */
383 #define REGCP_SET(cp) \
385 PerlIO_printf(Perl_debug_log, \
386 " Setting an EVAL scope, savestack=%"IVdf"\n", \
387 (IV)PL_savestack_ix)); \
390 #define REGCP_UNWIND(cp) \
392 if (cp != PL_savestack_ix) \
393 PerlIO_printf(Perl_debug_log, \
394 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
395 (IV)(cp), (IV)PL_savestack_ix)); \
398 #define UNWIND_PAREN(lp, lcp) \
399 for (n = rex->lastparen; n > lp; n--) \
400 rex->offs[n].end = -1; \
401 rex->lastparen = n; \
402 rex->lastcloseparen = lcp;
406 S_regcppop(pTHX_ regexp *rex)
411 GET_RE_DEBUG_FLAGS_DECL;
413 PERL_ARGS_ASSERT_REGCPPOP;
415 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
417 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
418 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
419 rex->lastcloseparen = SSPOPINT;
420 rex->lastparen = SSPOPINT;
421 PL_regsize = SSPOPINT;
423 i -= REGCP_OTHER_ELEMS;
424 /* Now restore the parentheses context. */
426 if (i || rex->lastparen + 1 <= rex->nparens)
427 PerlIO_printf(Perl_debug_log,
428 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
434 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
436 rex->offs[paren].start_tmp = SSPOPINT;
437 rex->offs[paren].start = SSPOPINT;
439 if (paren <= rex->lastparen)
440 rex->offs[paren].end = tmps;
441 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
442 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
444 (IV)rex->offs[paren].start,
445 (IV)rex->offs[paren].start_tmp,
446 (IV)rex->offs[paren].end,
447 (paren > rex->lastparen ? "(skipped)" : ""));
452 /* It would seem that the similar code in regtry()
453 * already takes care of this, and in fact it is in
454 * a better location to since this code can #if 0-ed out
455 * but the code in regtry() is needed or otherwise tests
456 * requiring null fields (pat.t#187 and split.t#{13,14}
457 * (as of patchlevel 7877) will fail. Then again,
458 * this code seems to be necessary or otherwise
459 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
460 * --jhi updated by dapm */
461 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
463 rex->offs[i].start = -1;
464 rex->offs[i].end = -1;
465 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
466 " \\%"UVuf": %s ..-1 undeffing\n",
468 (i > PL_regsize) ? "-1" : " "
474 /* restore the parens and associated vars at savestack position ix,
475 * but without popping the stack */
478 S_regcp_restore(pTHX_ regexp *rex, I32 ix)
480 I32 tmpix = PL_savestack_ix;
481 PL_savestack_ix = ix;
483 PL_savestack_ix = tmpix;
486 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
489 * pregexec and friends
492 #ifndef PERL_IN_XSUB_RE
494 - pregexec - match a regexp against a string
497 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
498 char *strbeg, I32 minend, SV *screamer, U32 nosave)
499 /* stringarg: the point in the string at which to begin matching */
500 /* strend: pointer to null at end of string */
501 /* strbeg: real beginning of string */
502 /* minend: end of match must be >= minend bytes after stringarg. */
503 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
504 * itself is accessed via the pointers above */
505 /* nosave: For optimizations. */
507 PERL_ARGS_ASSERT_PREGEXEC;
510 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
511 nosave ? 0 : REXEC_COPY_STR);
516 * Need to implement the following flags for reg_anch:
518 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
520 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
521 * INTUIT_AUTORITATIVE_ML
522 * INTUIT_ONCE_NOML - Intuit can match in one location only.
525 * Another flag for this function: SECOND_TIME (so that float substrs
526 * with giant delta may be not rechecked).
529 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
531 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
532 Otherwise, only SvCUR(sv) is used to get strbeg. */
534 /* XXXX We assume that strpos is strbeg unless sv. */
536 /* XXXX Some places assume that there is a fixed substring.
537 An update may be needed if optimizer marks as "INTUITable"
538 RExen without fixed substrings. Similarly, it is assumed that
539 lengths of all the strings are no more than minlen, thus they
540 cannot come from lookahead.
541 (Or minlen should take into account lookahead.)
542 NOTE: Some of this comment is not correct. minlen does now take account
543 of lookahead/behind. Further research is required. -- demerphq
547 /* A failure to find a constant substring means that there is no need to make
548 an expensive call to REx engine, thus we celebrate a failure. Similarly,
549 finding a substring too deep into the string means that less calls to
550 regtry() should be needed.
552 REx compiler's optimizer found 4 possible hints:
553 a) Anchored substring;
555 c) Whether we are anchored (beginning-of-line or \G);
556 d) First node (of those at offset 0) which may distinguish positions;
557 We use a)b)d) and multiline-part of c), and try to find a position in the
558 string which does not contradict any of them.
561 /* Most of decisions we do here should have been done at compile time.
562 The nodes of the REx which we used for the search should have been
563 deleted from the finite automaton. */
566 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
567 char *strend, const U32 flags, re_scream_pos_data *data)
570 struct regexp *const prog = (struct regexp *)SvANY(rx);
572 /* Should be nonnegative! */
578 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
580 char *other_last = NULL; /* other substr checked before this */
581 char *check_at = NULL; /* check substr found at this pos */
582 char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/
583 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
584 RXi_GET_DECL(prog,progi);
586 const char * const i_strpos = strpos;
588 GET_RE_DEBUG_FLAGS_DECL;
590 PERL_ARGS_ASSERT_RE_INTUIT_START;
591 PERL_UNUSED_ARG(flags);
592 PERL_UNUSED_ARG(data);
594 RX_MATCH_UTF8_set(rx,utf8_target);
597 PL_reg_flags |= RF_utf8;
600 debug_start_match(rx, utf8_target, strpos, strend,
601 sv ? "Guessing start of match in sv for"
602 : "Guessing start of match in string for");
605 /* CHR_DIST() would be more correct here but it makes things slow. */
606 if (prog->minlen > strend - strpos) {
607 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
608 "String too short... [re_intuit_start]\n"));
612 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
615 if (!prog->check_utf8 && prog->check_substr)
616 to_utf8_substr(prog);
617 check = prog->check_utf8;
619 if (!prog->check_substr && prog->check_utf8)
620 to_byte_substr(prog);
621 check = prog->check_substr;
623 if (check == &PL_sv_undef) {
624 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
625 "Non-utf8 string cannot match utf8 check string\n"));
628 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
629 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
630 || ( (prog->extflags & RXf_ANCH_BOL)
631 && !multiline ) ); /* Check after \n? */
634 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
635 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
636 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
638 && (strpos != strbeg)) {
639 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
642 if (prog->check_offset_min == prog->check_offset_max &&
643 !(prog->extflags & RXf_CANY_SEEN)) {
644 /* Substring at constant offset from beg-of-str... */
647 s = HOP3c(strpos, prog->check_offset_min, strend);
650 slen = SvCUR(check); /* >= 1 */
652 if ( strend - s > slen || strend - s < slen - 1
653 || (strend - s == slen && strend[-1] != '\n')) {
654 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
657 /* Now should match s[0..slen-2] */
659 if (slen && (*SvPVX_const(check) != *s
661 && memNE(SvPVX_const(check), s, slen)))) {
663 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
667 else if (*SvPVX_const(check) != *s
668 || ((slen = SvCUR(check)) > 1
669 && memNE(SvPVX_const(check), s, slen)))
672 goto success_at_start;
675 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
677 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
678 end_shift = prog->check_end_shift;
681 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
682 - (SvTAIL(check) != 0);
683 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
685 if (end_shift < eshift)
689 else { /* Can match at random position */
692 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
693 end_shift = prog->check_end_shift;
695 /* end shift should be non negative here */
698 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
700 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
701 (IV)end_shift, RX_PRECOMP(prog));
705 /* Find a possible match in the region s..strend by looking for
706 the "check" substring in the region corrected by start/end_shift. */
709 I32 srch_start_shift = start_shift;
710 I32 srch_end_shift = end_shift;
713 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
714 srch_end_shift -= ((strbeg - s) - srch_start_shift);
715 srch_start_shift = strbeg - s;
717 DEBUG_OPTIMISE_MORE_r({
718 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
719 (IV)prog->check_offset_min,
720 (IV)srch_start_shift,
722 (IV)prog->check_end_shift);
725 if (prog->extflags & RXf_CANY_SEEN) {
726 start_point= (U8*)(s + srch_start_shift);
727 end_point= (U8*)(strend - srch_end_shift);
729 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
730 end_point= HOP3(strend, -srch_end_shift, strbeg);
732 DEBUG_OPTIMISE_MORE_r({
733 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
734 (int)(end_point - start_point),
735 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
739 s = fbm_instr( start_point, end_point,
740 check, multiline ? FBMrf_MULTILINE : 0);
742 /* Update the count-of-usability, remove useless subpatterns,
746 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
747 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
748 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
749 (s ? "Found" : "Did not find"),
750 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
751 ? "anchored" : "floating"),
754 (s ? " at offset " : "...\n") );
759 /* Finish the diagnostic message */
760 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
762 /* XXX dmq: first branch is for positive lookbehind...
763 Our check string is offset from the beginning of the pattern.
764 So we need to do any stclass tests offset forward from that
773 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
774 Start with the other substr.
775 XXXX no SCREAM optimization yet - and a very coarse implementation
776 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
777 *always* match. Probably should be marked during compile...
778 Probably it is right to do no SCREAM here...
781 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
782 : (prog->float_substr && prog->anchored_substr))
784 /* Take into account the "other" substring. */
785 /* XXXX May be hopelessly wrong for UTF... */
788 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
791 char * const last = HOP3c(s, -start_shift, strbeg);
793 char * const saved_s = s;
796 t = s - prog->check_offset_max;
797 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
799 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
804 t = HOP3c(t, prog->anchored_offset, strend);
805 if (t < other_last) /* These positions already checked */
807 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
810 /* XXXX It is not documented what units *_offsets are in.
811 We assume bytes, but this is clearly wrong.
812 Meaning this code needs to be carefully reviewed for errors.
816 /* On end-of-str: see comment below. */
817 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
818 if (must == &PL_sv_undef) {
820 DEBUG_r(must = prog->anchored_utf8); /* for debug */
825 HOP3(HOP3(last1, prog->anchored_offset, strend)
826 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
828 multiline ? FBMrf_MULTILINE : 0
831 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
832 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
833 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
834 (s ? "Found" : "Contradicts"),
835 quoted, RE_SV_TAIL(must));
840 if (last1 >= last2) {
841 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
842 ", giving up...\n"));
845 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
846 ", trying floating at offset %ld...\n",
847 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
848 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
849 s = HOP3c(last, 1, strend);
853 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
854 (long)(s - i_strpos)));
855 t = HOP3c(s, -prog->anchored_offset, strbeg);
856 other_last = HOP3c(s, 1, strend);
864 else { /* Take into account the floating substring. */
866 char * const saved_s = s;
869 t = HOP3c(s, -start_shift, strbeg);
871 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
872 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
873 last = HOP3c(t, prog->float_max_offset, strend);
874 s = HOP3c(t, prog->float_min_offset, strend);
877 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
878 must = utf8_target ? prog->float_utf8 : prog->float_substr;
879 /* fbm_instr() takes into account exact value of end-of-str
880 if the check is SvTAIL(ed). Since false positives are OK,
881 and end-of-str is not later than strend we are OK. */
882 if (must == &PL_sv_undef) {
884 DEBUG_r(must = prog->float_utf8); /* for debug message */
887 s = fbm_instr((unsigned char*)s,
888 (unsigned char*)last + SvCUR(must)
890 must, multiline ? FBMrf_MULTILINE : 0);
892 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
893 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
894 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
895 (s ? "Found" : "Contradicts"),
896 quoted, RE_SV_TAIL(must));
900 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
901 ", giving up...\n"));
904 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
905 ", trying anchored starting at offset %ld...\n",
906 (long)(saved_s + 1 - i_strpos)));
908 s = HOP3c(t, 1, strend);
912 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
913 (long)(s - i_strpos)));
914 other_last = s; /* Fix this later. --Hugo */
924 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
926 DEBUG_OPTIMISE_MORE_r(
927 PerlIO_printf(Perl_debug_log,
928 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
929 (IV)prog->check_offset_min,
930 (IV)prog->check_offset_max,
938 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
940 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
943 /* Fixed substring is found far enough so that the match
944 cannot start at strpos. */
946 if (ml_anch && t[-1] != '\n') {
947 /* Eventually fbm_*() should handle this, but often
948 anchored_offset is not 0, so this check will not be wasted. */
949 /* XXXX In the code below we prefer to look for "^" even in
950 presence of anchored substrings. And we search even
951 beyond the found float position. These pessimizations
952 are historical artefacts only. */
954 while (t < strend - prog->minlen) {
956 if (t < check_at - prog->check_offset_min) {
957 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
958 /* Since we moved from the found position,
959 we definitely contradict the found anchored
960 substr. Due to the above check we do not
961 contradict "check" substr.
962 Thus we can arrive here only if check substr
963 is float. Redo checking for "other"=="fixed".
966 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
967 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
968 goto do_other_anchored;
970 /* We don't contradict the found floating substring. */
971 /* XXXX Why not check for STCLASS? */
973 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
974 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
977 /* Position contradicts check-string */
978 /* XXXX probably better to look for check-string
979 than for "\n", so one should lower the limit for t? */
980 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
981 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
982 other_last = strpos = s = t + 1;
987 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
988 PL_colors[0], PL_colors[1]));
992 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
993 PL_colors[0], PL_colors[1]));
997 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1000 /* The found string does not prohibit matching at strpos,
1001 - no optimization of calling REx engine can be performed,
1002 unless it was an MBOL and we are not after MBOL,
1003 or a future STCLASS check will fail this. */
1005 /* Even in this situation we may use MBOL flag if strpos is offset
1006 wrt the start of the string. */
1007 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1008 && (strpos != strbeg) && strpos[-1] != '\n'
1009 /* May be due to an implicit anchor of m{.*foo} */
1010 && !(prog->intflags & PREGf_IMPLICIT))
1015 DEBUG_EXECUTE_r( if (ml_anch)
1016 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1017 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1020 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1022 prog->check_utf8 /* Could be deleted already */
1023 && --BmUSEFUL(prog->check_utf8) < 0
1024 && (prog->check_utf8 == prog->float_utf8)
1026 prog->check_substr /* Could be deleted already */
1027 && --BmUSEFUL(prog->check_substr) < 0
1028 && (prog->check_substr == prog->float_substr)
1031 /* If flags & SOMETHING - do not do it many times on the same match */
1032 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1033 /* XXX Does the destruction order has to change with utf8_target? */
1034 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1035 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1036 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1037 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1038 check = NULL; /* abort */
1040 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag
1041 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1042 if (prog->intflags & PREGf_IMPLICIT)
1043 prog->extflags &= ~RXf_ANCH_MBOL;
1044 /* XXXX This is a remnant of the old implementation. It
1045 looks wasteful, since now INTUIT can use many
1046 other heuristics. */
1047 prog->extflags &= ~RXf_USE_INTUIT;
1048 /* XXXX What other flags might need to be cleared in this branch? */
1054 /* Last resort... */
1055 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1056 /* trie stclasses are too expensive to use here, we are better off to
1057 leave it to regmatch itself */
1058 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1059 /* minlen == 0 is possible if regstclass is \b or \B,
1060 and the fixed substr is ''$.
1061 Since minlen is already taken into account, s+1 is before strend;
1062 accidentally, minlen >= 1 guaranties no false positives at s + 1
1063 even for \b or \B. But (minlen? 1 : 0) below assumes that
1064 regstclass does not come from lookahead... */
1065 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1066 This leaves EXACTF-ish only, which are dealt with in find_byclass(). */
1067 const U8* const str = (U8*)STRING(progi->regstclass);
1068 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1069 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1072 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1073 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1074 else if (prog->float_substr || prog->float_utf8)
1075 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1079 if (checked_upto < s)
1081 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1082 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1085 s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL);
1090 const char *what = NULL;
1092 if (endpos == strend) {
1093 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1094 "Could not match STCLASS...\n") );
1097 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1098 "This position contradicts STCLASS...\n") );
1099 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1101 checked_upto = HOPBACKc(endpos, start_shift);
1102 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n",
1103 (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg)));
1104 /* Contradict one of substrings */
1105 if (prog->anchored_substr || prog->anchored_utf8) {
1106 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1107 DEBUG_EXECUTE_r( what = "anchored" );
1109 s = HOP3c(t, 1, strend);
1110 if (s + start_shift + end_shift > strend) {
1111 /* XXXX Should be taken into account earlier? */
1112 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1113 "Could not match STCLASS...\n") );
1118 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1119 "Looking for %s substr starting at offset %ld...\n",
1120 what, (long)(s + start_shift - i_strpos)) );
1123 /* Have both, check_string is floating */
1124 if (t + start_shift >= check_at) /* Contradicts floating=check */
1125 goto retry_floating_check;
1126 /* Recheck anchored substring, but not floating... */
1130 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1131 "Looking for anchored substr starting at offset %ld...\n",
1132 (long)(other_last - i_strpos)) );
1133 goto do_other_anchored;
1135 /* Another way we could have checked stclass at the
1136 current position only: */
1141 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1142 "Looking for /%s^%s/m starting at offset %ld...\n",
1143 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1146 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1148 /* Check is floating substring. */
1149 retry_floating_check:
1150 t = check_at - start_shift;
1151 DEBUG_EXECUTE_r( what = "floating" );
1152 goto hop_and_restart;
1155 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1156 "By STCLASS: moving %ld --> %ld\n",
1157 (long)(t - i_strpos), (long)(s - i_strpos))
1161 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1162 "Does not contradict STCLASS...\n");
1167 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1168 PL_colors[4], (check ? "Guessed" : "Giving up"),
1169 PL_colors[5], (long)(s - i_strpos)) );
1172 fail_finish: /* Substring not found */
1173 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1174 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1176 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1177 PL_colors[4], PL_colors[5]));
1181 #define DECL_TRIE_TYPE(scan) \
1182 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1183 trie_type = ((scan->flags == EXACT) \
1184 ? (utf8_target ? trie_utf8 : trie_plain) \
1185 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1187 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1188 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1190 switch (trie_type) { \
1191 case trie_utf8_fold: \
1192 if ( foldlen>0 ) { \
1193 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1198 uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \
1199 len = UTF8SKIP(uc); \
1200 skiplen = UNISKIP( uvc ); \
1201 foldlen -= skiplen; \
1202 uscan = foldbuf + skiplen; \
1205 case trie_latin_utf8_fold: \
1206 if ( foldlen>0 ) { \
1207 uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1213 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1214 skiplen = UNISKIP( uvc ); \
1215 foldlen -= skiplen; \
1216 uscan = foldbuf + skiplen; \
1220 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1227 charid = trie->charmap[ uvc ]; \
1231 if (widecharmap) { \
1232 SV** const svpp = hv_fetch(widecharmap, \
1233 (char*)&uvc, sizeof(UV), 0); \
1235 charid = (U16)SvIV(*svpp); \
1240 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1244 && (ln == 1 || folder(s, pat_string, ln)) \
1245 && (!reginfo || regtry(reginfo, &s)) ) \
1251 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1253 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1259 #define REXEC_FBC_SCAN(CoDe) \
1261 while (s < strend) { \
1267 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1268 REXEC_FBC_UTF8_SCAN( \
1270 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1279 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1282 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1291 #define REXEC_FBC_TRYIT \
1292 if ((!reginfo || regtry(reginfo, &s))) \
1295 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1296 if (utf8_target) { \
1297 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1300 REXEC_FBC_CLASS_SCAN(CoNd); \
1303 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1304 if (utf8_target) { \
1306 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1309 REXEC_FBC_CLASS_SCAN(CoNd); \
1312 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1313 PL_reg_flags |= RF_tainted; \
1314 if (utf8_target) { \
1315 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1318 REXEC_FBC_CLASS_SCAN(CoNd); \
1321 #define DUMP_EXEC_POS(li,s,doutf8) \
1322 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1325 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1326 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1327 tmp = TEST_NON_UTF8(tmp); \
1328 REXEC_FBC_UTF8_SCAN( \
1329 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1338 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1339 if (s == PL_bostr) { \
1343 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \
1344 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \
1347 LOAD_UTF8_CHARCLASS_ALNUM(); \
1348 REXEC_FBC_UTF8_SCAN( \
1349 if (tmp == ! (TeSt2_UtF8)) { \
1358 /* The only difference between the BOUND and NBOUND cases is that
1359 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1360 * NBOUND. This is accomplished by passing it in either the if or else clause,
1361 * with the other one being empty */
1362 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1363 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1365 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1366 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1368 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1369 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1371 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1372 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1375 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1376 * be passed in completely with the variable name being tested, which isn't
1377 * such a clean interface, but this is easier to read than it was before. We
1378 * are looking for the boundary (or non-boundary between a word and non-word
1379 * character. The utf8 and non-utf8 cases have the same logic, but the details
1380 * must be different. Find the "wordness" of the character just prior to this
1381 * one, and compare it with the wordness of this one. If they differ, we have
1382 * a boundary. At the beginning of the string, pretend that the previous
1383 * character was a new-line */
1384 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1385 if (utf8_target) { \
1388 else { /* Not utf8 */ \
1389 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \
1390 tmp = TEST_NON_UTF8(tmp); \
1392 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1401 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \
1404 /* We know what class REx starts with. Try to find this position... */
1405 /* if reginfo is NULL, its a dryrun */
1406 /* annoyingly all the vars in this routine have different names from their counterparts
1407 in regmatch. /grrr */
1410 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1411 const char *strend, regmatch_info *reginfo)
1414 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1415 char *pat_string; /* The pattern's exactish string */
1416 char *pat_end; /* ptr to end char of pat_string */
1417 re_fold_t folder; /* Function for computing non-utf8 folds */
1418 const U8 *fold_array; /* array for folding ords < 256 */
1425 I32 tmp = 1; /* Scratch variable? */
1426 const bool utf8_target = PL_reg_match_utf8;
1427 UV utf8_fold_flags = 0;
1428 RXi_GET_DECL(prog,progi);
1430 PERL_ARGS_ASSERT_FIND_BYCLASS;
1432 /* We know what class it must start with. */
1436 if (utf8_target || OP(c) == ANYOFV) {
1437 STRLEN inclasslen = strend - s;
1438 REXEC_FBC_UTF8_CLASS_SCAN(
1439 reginclass(prog, c, (U8*)s, &inclasslen, utf8_target));
1442 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1447 if (tmp && (!reginfo || regtry(reginfo, &s)))
1455 if (UTF_PATTERN || utf8_target) {
1456 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1457 goto do_exactf_utf8;
1459 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1460 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1461 goto do_exactf_non_utf8; /* isn't dealt with by these */
1466 /* regcomp.c already folded this if pattern is in UTF-8 */
1467 utf8_fold_flags = 0;
1468 goto do_exactf_utf8;
1470 fold_array = PL_fold;
1472 goto do_exactf_non_utf8;
1475 if (UTF_PATTERN || utf8_target) {
1476 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
1477 goto do_exactf_utf8;
1479 fold_array = PL_fold_locale;
1480 folder = foldEQ_locale;
1481 goto do_exactf_non_utf8;
1485 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1487 goto do_exactf_utf8;
1489 case EXACTFU_TRICKYFOLD:
1491 if (UTF_PATTERN || utf8_target) {
1492 utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1493 goto do_exactf_utf8;
1496 /* Any 'ss' in the pattern should have been replaced by regcomp,
1497 * so we don't have to worry here about this single special case
1498 * in the Latin1 range */
1499 fold_array = PL_fold_latin1;
1500 folder = foldEQ_latin1;
1504 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1505 are no glitches with fold-length differences
1506 between the target string and pattern */
1508 /* The idea in the non-utf8 EXACTF* cases is to first find the
1509 * first character of the EXACTF* node and then, if necessary,
1510 * case-insensitively compare the full text of the node. c1 is the
1511 * first character. c2 is its fold. This logic will not work for
1512 * Unicode semantics and the german sharp ss, which hence should
1513 * not be compiled into a node that gets here. */
1514 pat_string = STRING(c);
1515 ln = STR_LEN(c); /* length to match in octets/bytes */
1517 /* We know that we have to match at least 'ln' bytes (which is the
1518 * same as characters, since not utf8). If we have to match 3
1519 * characters, and there are only 2 availabe, we know without
1520 * trying that it will fail; so don't start a match past the
1521 * required minimum number from the far end */
1522 e = HOP3c(strend, -((I32)ln), s);
1524 if (!reginfo && e < s) {
1525 e = s; /* Due to minlen logic of intuit() */
1529 c2 = fold_array[c1];
1530 if (c1 == c2) { /* If char and fold are the same */
1531 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1534 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1543 /* If one of the operands is in utf8, we can't use the simpler
1544 * folding above, due to the fact that many different characters
1545 * can have the same fold, or portion of a fold, or different-
1547 pat_string = STRING(c);
1548 ln = STR_LEN(c); /* length to match in octets/bytes */
1549 pat_end = pat_string + ln;
1550 lnc = (UTF_PATTERN) /* length to match in characters */
1551 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1554 /* We have 'lnc' characters to match in the pattern, but because of
1555 * multi-character folding, each character in the target can match
1556 * up to 3 characters (Unicode guarantees it will never exceed
1557 * this) if it is utf8-encoded; and up to 2 if not (based on the
1558 * fact that the Latin 1 folds are already determined, and the
1559 * only multi-char fold in that range is the sharp-s folding to
1560 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1561 * string character. Adjust lnc accordingly, rounding up, so that
1562 * if we need to match at least 4+1/3 chars, that really is 5. */
1563 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1564 lnc = (lnc + expansion - 1) / expansion;
1566 /* As in the non-UTF8 case, if we have to match 3 characters, and
1567 * only 2 are left, it's guaranteed to fail, so don't start a
1568 * match that would require us to go beyond the end of the string
1570 e = HOP3c(strend, -((I32)lnc), s);
1572 if (!reginfo && e < s) {
1573 e = s; /* Due to minlen logic of intuit() */
1576 /* XXX Note that we could recalculate e to stop the loop earlier,
1577 * as the worst case expansion above will rarely be met, and as we
1578 * go along we would usually find that e moves further to the left.
1579 * This would happen only after we reached the point in the loop
1580 * where if there were no expansion we should fail. Unclear if
1581 * worth the expense */
1584 char *my_strend= (char *)strend;
1585 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1586 pat_string, NULL, ln, cBOOL(UTF_PATTERN), utf8_fold_flags)
1587 && (!reginfo || regtry(reginfo, &s)) )
1591 s += (utf8_target) ? UTF8SKIP(s) : 1;
1596 PL_reg_flags |= RF_tainted;
1597 FBC_BOUND(isALNUM_LC,
1598 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1599 isALNUM_LC_utf8((U8*)s));
1602 PL_reg_flags |= RF_tainted;
1603 FBC_NBOUND(isALNUM_LC,
1604 isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
1605 isALNUM_LC_utf8((U8*)s));
1608 FBC_BOUND(isWORDCHAR,
1610 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1613 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1615 isWORDCHAR_A((U8*)s));
1618 FBC_NBOUND(isWORDCHAR,
1620 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1623 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1625 isWORDCHAR_A((U8*)s));
1628 FBC_BOUND(isWORDCHAR_L1,
1630 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1633 FBC_NBOUND(isWORDCHAR_L1,
1635 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)));
1638 REXEC_FBC_CSCAN_TAINT(
1639 isALNUM_LC_utf8((U8*)s),
1644 REXEC_FBC_CSCAN_PRELOAD(
1645 LOAD_UTF8_CHARCLASS_ALNUM(),
1646 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1647 isWORDCHAR_L1((U8) *s)
1651 REXEC_FBC_CSCAN_PRELOAD(
1652 LOAD_UTF8_CHARCLASS_ALNUM(),
1653 swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1658 /* Don't need to worry about utf8, as it can match only a single
1659 * byte invariant character */
1660 REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s));
1663 REXEC_FBC_CSCAN_PRELOAD(
1664 LOAD_UTF8_CHARCLASS_ALNUM(),
1665 !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target),
1666 ! isWORDCHAR_L1((U8) *s)
1670 REXEC_FBC_CSCAN_PRELOAD(
1671 LOAD_UTF8_CHARCLASS_ALNUM(),
1672 !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target),
1683 REXEC_FBC_CSCAN_TAINT(
1684 !isALNUM_LC_utf8((U8*)s),
1689 REXEC_FBC_CSCAN_PRELOAD(
1690 LOAD_UTF8_CHARCLASS_SPACE(),
1691 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1696 REXEC_FBC_CSCAN_PRELOAD(
1697 LOAD_UTF8_CHARCLASS_SPACE(),
1698 *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target),
1703 /* Don't need to worry about utf8, as it can match only a single
1704 * byte invariant character */
1705 REXEC_FBC_CLASS_SCAN( isSPACE_A(*s));
1708 REXEC_FBC_CSCAN_TAINT(
1709 isSPACE_LC_utf8((U8*)s),
1714 REXEC_FBC_CSCAN_PRELOAD(
1715 LOAD_UTF8_CHARCLASS_SPACE(),
1716 !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1717 ! isSPACE_L1((U8) *s)
1721 REXEC_FBC_CSCAN_PRELOAD(
1722 LOAD_UTF8_CHARCLASS_SPACE(),
1723 !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)),
1734 REXEC_FBC_CSCAN_TAINT(
1735 !isSPACE_LC_utf8((U8*)s),
1740 REXEC_FBC_CSCAN_PRELOAD(
1741 LOAD_UTF8_CHARCLASS_DIGIT(),
1742 swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1747 /* Don't need to worry about utf8, as it can match only a single
1748 * byte invariant character */
1749 REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s));
1752 REXEC_FBC_CSCAN_TAINT(
1753 isDIGIT_LC_utf8((U8*)s),
1758 REXEC_FBC_CSCAN_PRELOAD(
1759 LOAD_UTF8_CHARCLASS_DIGIT(),
1760 !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target),
1771 REXEC_FBC_CSCAN_TAINT(
1772 !isDIGIT_LC_utf8((U8*)s),
1779 is_LNBREAK_latin1(s)
1791 !is_VERTWS_latin1(s)
1797 is_HORIZWS_latin1(s)
1802 !is_HORIZWS_utf8(s),
1803 !is_HORIZWS_latin1(s)
1807 /* Don't need to worry about utf8, as it can match only a single
1808 * byte invariant character. The flag in this node type is the
1809 * class number to pass to _generic_isCC() to build a mask for
1810 * searching in PL_charclass[] */
1811 REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c)));
1815 !_generic_isCC_A(*s, FLAGS(c)),
1816 !_generic_isCC_A(*s, FLAGS(c))
1824 /* what trie are we using right now */
1826 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1828 = (reg_trie_data*)progi->data->data[ aho->trie ];
1829 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1831 const char *last_start = strend - trie->minlen;
1833 const char *real_start = s;
1835 STRLEN maxlen = trie->maxlen;
1837 U8 **points; /* map of where we were in the input string
1838 when reading a given char. For ASCII this
1839 is unnecessary overhead as the relationship
1840 is always 1:1, but for Unicode, especially
1841 case folded Unicode this is not true. */
1842 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1846 GET_RE_DEBUG_FLAGS_DECL;
1848 /* We can't just allocate points here. We need to wrap it in
1849 * an SV so it gets freed properly if there is a croak while
1850 * running the match */
1853 sv_points=newSV(maxlen * sizeof(U8 *));
1854 SvCUR_set(sv_points,
1855 maxlen * sizeof(U8 *));
1856 SvPOK_on(sv_points);
1857 sv_2mortal(sv_points);
1858 points=(U8**)SvPV_nolen(sv_points );
1859 if ( trie_type != trie_utf8_fold
1860 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1863 bitmap=(U8*)trie->bitmap;
1865 bitmap=(U8*)ANYOF_BITMAP(c);
1867 /* this is the Aho-Corasick algorithm modified a touch
1868 to include special handling for long "unknown char"
1869 sequences. The basic idea being that we use AC as long
1870 as we are dealing with a possible matching char, when
1871 we encounter an unknown char (and we have not encountered
1872 an accepting state) we scan forward until we find a legal
1874 AC matching is basically that of trie matching, except
1875 that when we encounter a failing transition, we fall back
1876 to the current states "fail state", and try the current char
1877 again, a process we repeat until we reach the root state,
1878 state 1, or a legal transition. If we fail on the root state
1879 then we can either terminate if we have reached an accepting
1880 state previously, or restart the entire process from the beginning
1884 while (s <= last_start) {
1885 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1893 U8 *uscan = (U8*)NULL;
1894 U8 *leftmost = NULL;
1896 U32 accepted_word= 0;
1900 while ( state && uc <= (U8*)strend ) {
1902 U32 word = aho->states[ state ].wordnum;
1906 DEBUG_TRIE_EXECUTE_r(
1907 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1908 dump_exec_pos( (char *)uc, c, strend, real_start,
1909 (char *)uc, utf8_target );
1910 PerlIO_printf( Perl_debug_log,
1911 " Scanning for legal start char...\n");
1915 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1919 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1925 if (uc >(U8*)last_start) break;
1929 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1930 if (!leftmost || lpos < leftmost) {
1931 DEBUG_r(accepted_word=word);
1937 points[pointpos++ % maxlen]= uc;
1938 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1939 uscan, len, uvc, charid, foldlen,
1941 DEBUG_TRIE_EXECUTE_r({
1942 dump_exec_pos( (char *)uc, c, strend, real_start,
1944 PerlIO_printf(Perl_debug_log,
1945 " Charid:%3u CP:%4"UVxf" ",
1951 word = aho->states[ state ].wordnum;
1953 base = aho->states[ state ].trans.base;
1955 DEBUG_TRIE_EXECUTE_r({
1957 dump_exec_pos( (char *)uc, c, strend, real_start,
1959 PerlIO_printf( Perl_debug_log,
1960 "%sState: %4"UVxf", word=%"UVxf,
1961 failed ? " Fail transition to " : "",
1962 (UV)state, (UV)word);
1968 ( ((offset = base + charid
1969 - 1 - trie->uniquecharcount)) >= 0)
1970 && ((U32)offset < trie->lasttrans)
1971 && trie->trans[offset].check == state
1972 && (tmp=trie->trans[offset].next))
1974 DEBUG_TRIE_EXECUTE_r(
1975 PerlIO_printf( Perl_debug_log," - legal\n"));
1980 DEBUG_TRIE_EXECUTE_r(
1981 PerlIO_printf( Perl_debug_log," - fail\n"));
1983 state = aho->fail[state];
1987 /* we must be accepting here */
1988 DEBUG_TRIE_EXECUTE_r(
1989 PerlIO_printf( Perl_debug_log," - accepting\n"));
1998 if (!state) state = 1;
2001 if ( aho->states[ state ].wordnum ) {
2002 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2003 if (!leftmost || lpos < leftmost) {
2004 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2009 s = (char*)leftmost;
2010 DEBUG_TRIE_EXECUTE_r({
2012 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2013 (UV)accepted_word, (IV)(s - real_start)
2016 if (!reginfo || regtry(reginfo, &s)) {
2022 DEBUG_TRIE_EXECUTE_r({
2023 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2026 DEBUG_TRIE_EXECUTE_r(
2027 PerlIO_printf( Perl_debug_log,"No match.\n"));
2036 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2046 - regexec_flags - match a regexp against a string
2049 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
2050 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
2051 /* stringarg: the point in the string at which to begin matching */
2052 /* strend: pointer to null at end of string */
2053 /* strbeg: real beginning of string */
2054 /* minend: end of match must be >= minend bytes after stringarg. */
2055 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2056 * itself is accessed via the pointers above */
2057 /* data: May be used for some additional optimizations.
2058 Currently its only used, with a U32 cast, for transmitting
2059 the ganch offset when doing a /g match. This will change */
2060 /* nosave: For optimizations. */
2064 struct regexp *const prog = (struct regexp *)SvANY(rx);
2065 /*register*/ char *s;
2067 /*register*/ char *startpos = stringarg;
2068 I32 minlen; /* must match at least this many chars */
2069 I32 dontbother = 0; /* how many characters not to try at end */
2070 I32 end_shift = 0; /* Same for the end. */ /* CC */
2071 I32 scream_pos = -1; /* Internal iterator of scream. */
2072 char *scream_olds = NULL;
2073 const bool utf8_target = cBOOL(DO_UTF8(sv));
2075 RXi_GET_DECL(prog,progi);
2076 regmatch_info reginfo; /* create some info to pass to regtry etc */
2077 regexp_paren_pair *swap = NULL;
2078 GET_RE_DEBUG_FLAGS_DECL;
2080 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2081 PERL_UNUSED_ARG(data);
2083 /* Be paranoid... */
2084 if (prog == NULL || startpos == NULL) {
2085 Perl_croak(aTHX_ "NULL regexp parameter");
2089 multiline = prog->extflags & RXf_PMf_MULTILINE;
2090 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
2092 RX_MATCH_UTF8_set(rx, utf8_target);
2094 debug_start_match(rx, utf8_target, startpos, strend,
2098 minlen = prog->minlen;
2100 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2101 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2102 "String too short [regexec_flags]...\n"));
2107 /* Check validity of program. */
2108 if (UCHARAT(progi->program) != REG_MAGIC) {
2109 Perl_croak(aTHX_ "corrupted regexp program");
2113 PL_reg_state.re_state_eval_setup_done = FALSE;
2117 PL_reg_flags |= RF_utf8;
2119 /* Mark beginning of line for ^ and lookbehind. */
2120 reginfo.bol = startpos; /* XXX not used ??? */
2124 /* Mark end of line for $ (and such) */
2127 /* see how far we have to get to not match where we matched before */
2128 reginfo.till = startpos+minend;
2130 /* If there is a "must appear" string, look for it. */
2133 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2135 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2136 reginfo.ganch = startpos + prog->gofs;
2137 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2138 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2139 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2141 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2142 && mg->mg_len >= 0) {
2143 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2144 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2145 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2147 if (prog->extflags & RXf_ANCH_GPOS) {
2148 if (s > reginfo.ganch)
2150 s = reginfo.ganch - prog->gofs;
2151 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2152 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2158 reginfo.ganch = strbeg + PTR2UV(data);
2159 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2160 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2162 } else { /* pos() not defined */
2163 reginfo.ganch = strbeg;
2164 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2165 "GPOS: reginfo.ganch = strbeg\n"));
2168 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2169 /* We have to be careful. If the previous successful match
2170 was from this regex we don't want a subsequent partially
2171 successful match to clobber the old results.
2172 So when we detect this possibility we add a swap buffer
2173 to the re, and switch the buffer each match. If we fail
2174 we switch it back, otherwise we leave it swapped.
2177 /* do we need a save destructor here for eval dies? */
2178 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2179 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2180 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2186 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2187 re_scream_pos_data d;
2189 d.scream_olds = &scream_olds;
2190 d.scream_pos = &scream_pos;
2191 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2193 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2194 goto phooey; /* not present */
2200 /* Simplest case: anchored match need be tried only once. */
2201 /* [unless only anchor is BOL and multiline is set] */
2202 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2203 if (s == startpos && regtry(®info, &startpos))
2205 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2206 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2211 dontbother = minlen - 1;
2212 end = HOP3c(strend, -dontbother, strbeg) - 1;
2213 /* for multiline we only have to try after newlines */
2214 if (prog->check_substr || prog->check_utf8) {
2215 /* because of the goto we can not easily reuse the macros for bifurcating the
2216 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2219 goto after_try_utf8;
2221 if (regtry(®info, &s)) {
2228 if (prog->extflags & RXf_USE_INTUIT) {
2229 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2238 } /* end search for check string in unicode */
2240 if (s == startpos) {
2241 goto after_try_latin;
2244 if (regtry(®info, &s)) {
2251 if (prog->extflags & RXf_USE_INTUIT) {
2252 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2261 } /* end search for check string in latin*/
2262 } /* end search for check string */
2263 else { /* search for newline */
2265 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2268 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2269 while (s <= end) { /* note it could be possible to match at the end of the string */
2270 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2271 if (regtry(®info, &s))
2275 } /* end search for newline */
2276 } /* end anchored/multiline check string search */
2278 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2280 /* the warning about reginfo.ganch being used without initialization
2281 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2282 and we only enter this block when the same bit is set. */
2283 char *tmp_s = reginfo.ganch - prog->gofs;
2285 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2290 /* Messy cases: unanchored match. */
2291 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2292 /* we have /x+whatever/ */
2293 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2298 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2299 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2300 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2305 DEBUG_EXECUTE_r( did_match = 1 );
2306 if (regtry(®info, &s)) goto got_it;
2308 while (s < strend && *s == ch)
2316 DEBUG_EXECUTE_r( did_match = 1 );
2317 if (regtry(®info, &s)) goto got_it;
2319 while (s < strend && *s == ch)
2324 DEBUG_EXECUTE_r(if (!did_match)
2325 PerlIO_printf(Perl_debug_log,
2326 "Did not find anchored character...\n")
2329 else if (prog->anchored_substr != NULL
2330 || prog->anchored_utf8 != NULL
2331 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2332 && prog->float_max_offset < strend - s)) {
2337 char *last1; /* Last position checked before */
2341 if (prog->anchored_substr || prog->anchored_utf8) {
2342 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2343 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2344 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2345 back_max = back_min = prog->anchored_offset;
2347 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2348 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2349 must = utf8_target ? prog->float_utf8 : prog->float_substr;
2350 back_max = prog->float_max_offset;
2351 back_min = prog->float_min_offset;
2355 if (must == &PL_sv_undef)
2356 /* could not downgrade utf8 check substring, so must fail */
2362 last = HOP3c(strend, /* Cannot start after this */
2363 -(I32)(CHR_SVLEN(must)
2364 - (SvTAIL(must) != 0) + back_min), strbeg);
2367 last1 = HOPc(s, -1);
2369 last1 = s - 1; /* bogus */
2371 /* XXXX check_substr already used to find "s", can optimize if
2372 check_substr==must. */
2374 dontbother = end_shift;
2375 strend = HOPc(strend, -dontbother);
2376 while ( (s <= last) &&
2377 (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2378 (unsigned char*)strend, must,
2379 multiline ? FBMrf_MULTILINE : 0)) ) {
2380 DEBUG_EXECUTE_r( did_match = 1 );
2381 if (HOPc(s, -back_max) > last1) {
2382 last1 = HOPc(s, -back_min);
2383 s = HOPc(s, -back_max);
2386 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2388 last1 = HOPc(s, -back_min);
2392 while (s <= last1) {
2393 if (regtry(®info, &s))
2399 while (s <= last1) {
2400 if (regtry(®info, &s))
2406 DEBUG_EXECUTE_r(if (!did_match) {
2407 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2408 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2409 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2410 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2411 ? "anchored" : "floating"),
2412 quoted, RE_SV_TAIL(must));
2416 else if ( (c = progi->regstclass) ) {
2418 const OPCODE op = OP(progi->regstclass);
2419 /* don't bother with what can't match */
2420 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2421 strend = HOPc(strend, -(minlen - 1));
2424 SV * const prop = sv_newmortal();
2425 regprop(prog, prop, c);
2427 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2429 PerlIO_printf(Perl_debug_log,
2430 "Matching stclass %.*s against %s (%d bytes)\n",
2431 (int)SvCUR(prop), SvPVX_const(prop),
2432 quoted, (int)(strend - s));
2435 if (find_byclass(prog, c, s, strend, ®info))
2437 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2441 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2448 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2449 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2450 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2452 little = SvPV_const(float_real, len);
2453 if (SvTAIL(float_real)) {
2454 /* This means that float_real contains an artificial \n on the end
2455 * due to the presence of something like this: /foo$/
2456 * where we can match both "foo" and "foo\n" at the end of the string.
2457 * So we have to compare the end of the string first against the float_real
2458 * without the \n and then against the full float_real with the string.
2459 * We have to watch out for cases where the string might be smaller
2460 * than the float_real or the float_real without the \n.
2462 char *checkpos= strend - len;
2464 PerlIO_printf(Perl_debug_log,
2465 "%sChecking for float_real.%s\n",
2466 PL_colors[4], PL_colors[5]));
2467 if (checkpos + 1 < strbeg) {
2468 /* can't match, even if we remove the trailing \n string is too short to match */
2470 PerlIO_printf(Perl_debug_log,
2471 "%sString shorter than required trailing substring, cannot match.%s\n",
2472 PL_colors[4], PL_colors[5]));
2474 } else if (memEQ(checkpos + 1, little, len - 1)) {
2475 /* can match, the end of the string matches without the "\n" */
2476 last = checkpos + 1;
2477 } else if (checkpos < strbeg) {
2478 /* cant match, string is too short when the "\n" is included */
2480 PerlIO_printf(Perl_debug_log,
2481 "%sString does not contain required trailing substring, cannot match.%s\n",
2482 PL_colors[4], PL_colors[5]));
2484 } else if (!multiline) {
2485 /* non multiline match, so compare with the "\n" at the end of the string */
2486 if (memEQ(checkpos, little, len)) {
2490 PerlIO_printf(Perl_debug_log,
2491 "%sString does not contain required trailing substring, cannot match.%s\n",
2492 PL_colors[4], PL_colors[5]));
2496 /* multiline match, so we have to search for a place where the full string is located */
2502 last = rninstr(s, strend, little, little + len);
2504 last = strend; /* matching "$" */
2507 /* at one point this block contained a comment which was probably
2508 * incorrect, which said that this was a "should not happen" case.
2509 * Even if it was true when it was written I am pretty sure it is
2510 * not anymore, so I have removed the comment and replaced it with
2513 PerlIO_printf(Perl_debug_log,
2514 "String does not contain required substring, cannot match.\n"
2518 dontbother = strend - last + prog->float_min_offset;
2520 if (minlen && (dontbother < minlen))
2521 dontbother = minlen - 1;
2522 strend -= dontbother; /* this one's always in bytes! */
2523 /* We don't know much -- general case. */
2526 if (regtry(®info, &s))
2535 if (regtry(®info, &s))
2537 } while (s++ < strend);
2547 PerlIO_printf(Perl_debug_log,
2548 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
2554 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2556 if (PL_reg_state.re_state_eval_setup_done)
2557 restore_pos(aTHX_ prog);
2558 if (RXp_PAREN_NAMES(prog))
2559 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2561 /* make sure $`, $&, $', and $digit will work later */
2562 if ( !(flags & REXEC_NOT_FIRST) ) {
2563 if (flags & REXEC_COPY_STR) {
2564 #ifdef PERL_OLD_COPY_ON_WRITE
2566 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2568 PerlIO_printf(Perl_debug_log,
2569 "Copy on write: regexp capture, type %d\n",
2572 RX_MATCH_COPY_FREE(rx);
2573 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2574 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2575 assert (SvPOKp(prog->saved_copy));
2576 prog->sublen = PL_regeol - strbeg;
2577 prog->suboffset = 0;
2578 prog->subcoffset = 0;
2583 I32 max = PL_regeol - strbeg;
2586 if ( (flags & REXEC_COPY_SKIP_POST)
2587 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2588 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2589 ) { /* don't copy $' part of string */
2592 /* calculate the right-most part of the string covered
2593 * by a capture. Due to look-ahead, this may be to
2594 * the right of $&, so we have to scan all captures */
2595 while (n <= prog->lastparen) {
2596 if (prog->offs[n].end > max)
2597 max = prog->offs[n].end;
2601 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2602 ? prog->offs[0].start
2604 assert(max >= 0 && max <= PL_regeol - strbeg);
2607 if ( (flags & REXEC_COPY_SKIP_PRE)
2608 && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */
2609 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2610 ) { /* don't copy $` part of string */
2613 /* calculate the left-most part of the string covered
2614 * by a capture. Due to look-behind, this may be to
2615 * the left of $&, so we have to scan all captures */
2616 while (min && n <= prog->lastparen) {
2617 if ( prog->offs[n].start != -1
2618 && prog->offs[n].start < min)
2620 min = prog->offs[n].start;
2624 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2625 && min > prog->offs[0].end
2627 min = prog->offs[0].end;
2631 assert(min >= 0 && min <= max && min <= PL_regeol - strbeg);
2634 if (RX_MATCH_COPIED(rx)) {
2635 if (sublen > prog->sublen)
2637 (char*)saferealloc(prog->subbeg, sublen+1);
2640 prog->subbeg = (char*)safemalloc(sublen+1);
2641 Copy(strbeg + min, prog->subbeg, sublen, char);
2642 prog->subbeg[sublen] = '\0';
2643 prog->suboffset = min;
2644 prog->sublen = sublen;
2645 RX_MATCH_COPIED_on(rx);
2647 prog->subcoffset = prog->suboffset;
2648 if (prog->suboffset && utf8_target) {
2649 /* Convert byte offset to chars.
2650 * XXX ideally should only compute this if @-/@+
2651 * has been seen, a la PL_sawampersand ??? */
2653 /* If there's a direct correspondence between the
2654 * string which we're matching and the original SV,
2655 * then we can use the utf8 len cache associated with
2656 * the SV. In particular, it means that under //g,
2657 * sv_pos_b2u() will use the previously cached
2658 * position to speed up working out the new length of
2659 * subcoffset, rather than counting from the start of
2660 * the string each time. This stops
2661 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2662 * from going quadratic */
2663 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2664 sv_pos_b2u(sv, &(prog->subcoffset));
2666 prog->subcoffset = utf8_length((U8*)strbeg,
2667 (U8*)(strbeg+prog->suboffset));
2671 RX_MATCH_COPY_FREE(rx);
2672 prog->subbeg = strbeg;
2673 prog->suboffset = 0;
2674 prog->subcoffset = 0;
2675 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2682 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2683 PL_colors[4], PL_colors[5]));
2684 if (PL_reg_state.re_state_eval_setup_done)
2685 restore_pos(aTHX_ prog);
2687 /* we failed :-( roll it back */
2688 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2689 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
2694 Safefree(prog->offs);
2702 /* Set which rex is pointed to by PL_reg_state, handling ref counting.
2703 * Do inc before dec, in case old and new rex are the same */
2704 #define SET_reg_curpm(Re2) \
2705 if (PL_reg_state.re_state_eval_setup_done) { \
2706 (void)ReREFCNT_inc(Re2); \
2707 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
2708 PM_SETRE((PL_reg_curpm), (Re2)); \
2713 - regtry - try match at specific point
2715 STATIC I32 /* 0 failure, 1 success */
2716 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
2720 REGEXP *const rx = reginfo->prog;
2721 regexp *const prog = (struct regexp *)SvANY(rx);
2723 RXi_GET_DECL(prog,progi);
2724 GET_RE_DEBUG_FLAGS_DECL;
2726 PERL_ARGS_ASSERT_REGTRY;
2728 reginfo->cutpoint=NULL;
2730 if ((prog->extflags & RXf_EVAL_SEEN)
2731 && !PL_reg_state.re_state_eval_setup_done)
2735 PL_reg_state.re_state_eval_setup_done = TRUE;
2737 /* Make $_ available to executed code. */
2738 if (reginfo->sv != DEFSV) {
2740 DEFSV_set(reginfo->sv);
2743 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2744 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2745 /* prepare for quick setting of pos */
2746 #ifdef PERL_OLD_COPY_ON_WRITE
2747 if (SvIsCOW(reginfo->sv))
2748 sv_force_normal_flags(reginfo->sv, 0);
2750 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2751 &PL_vtbl_mglob, NULL, 0);
2755 PL_reg_oldpos = mg->mg_len;
2756 SAVEDESTRUCTOR_X(restore_pos, prog);
2758 if (!PL_reg_curpm) {
2759 Newxz(PL_reg_curpm, 1, PMOP);
2762 SV* const repointer = &PL_sv_undef;
2763 /* this regexp is also owned by the new PL_reg_curpm, which
2764 will try to free it. */
2765 av_push(PL_regex_padav, repointer);
2766 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2767 PL_regex_pad = AvARRAY(PL_regex_padav);
2772 PL_reg_oldcurpm = PL_curpm;
2773 PL_curpm = PL_reg_curpm;
2774 if (RXp_MATCH_COPIED(prog)) {
2775 /* Here is a serious problem: we cannot rewrite subbeg,
2776 since it may be needed if this match fails. Thus
2777 $` inside (?{}) could fail... */
2778 PL_reg_oldsaved = prog->subbeg;
2779 PL_reg_oldsavedlen = prog->sublen;
2780 PL_reg_oldsavedoffset = prog->suboffset;
2781 PL_reg_oldsavedcoffset = prog->suboffset;
2782 #ifdef PERL_OLD_COPY_ON_WRITE
2783 PL_nrs = prog->saved_copy;
2785 RXp_MATCH_COPIED_off(prog);
2788 PL_reg_oldsaved = NULL;
2789 prog->subbeg = PL_bostr;
2790 prog->suboffset = 0;
2791 prog->subcoffset = 0;
2792 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2795 PL_reg_starttry = *startposp;
2797 prog->offs[0].start = *startposp - PL_bostr;
2798 prog->lastparen = 0;
2799 prog->lastcloseparen = 0;
2802 /* XXXX What this code is doing here?!!! There should be no need
2803 to do this again and again, prog->lastparen should take care of
2806 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2807 * Actually, the code in regcppop() (which Ilya may be meaning by
2808 * prog->lastparen), is not needed at all by the test suite
2809 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2810 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2811 * Meanwhile, this code *is* needed for the
2812 * above-mentioned test suite tests to succeed. The common theme
2813 * on those tests seems to be returning null fields from matches.
2814 * --jhi updated by dapm */
2816 if (prog->nparens) {
2817 regexp_paren_pair *pp = prog->offs;
2819 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
2827 result = regmatch(reginfo, *startposp, progi->program + 1);
2829 prog->offs[0].end = result;
2832 if (reginfo->cutpoint)
2833 *startposp= reginfo->cutpoint;
2834 REGCP_UNWIND(lastcp);
2839 #define sayYES goto yes
2840 #define sayNO goto no
2841 #define sayNO_SILENT goto no_silent
2843 /* we dont use STMT_START/END here because it leads to
2844 "unreachable code" warnings, which are bogus, but distracting. */
2845 #define CACHEsayNO \
2846 if (ST.cache_mask) \
2847 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2850 /* this is used to determine how far from the left messages like
2851 'failed...' are printed. It should be set such that messages
2852 are inline with the regop output that created them.
2854 #define REPORT_CODE_OFF 32
2857 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2858 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2860 #define SLAB_FIRST(s) (&(s)->states[0])
2861 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2863 /* grab a new slab and return the first slot in it */
2865 STATIC regmatch_state *
2868 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2871 regmatch_slab *s = PL_regmatch_slab->next;
2873 Newx(s, 1, regmatch_slab);
2874 s->prev = PL_regmatch_slab;
2876 PL_regmatch_slab->next = s;
2878 PL_regmatch_slab = s;
2879 return SLAB_FIRST(s);
2883 /* push a new state then goto it */
2885 #define PUSH_STATE_GOTO(state, node, input) \
2886 pushinput = input; \
2888 st->resume_state = state; \
2891 /* push a new state with success backtracking, then goto it */
2893 #define PUSH_YES_STATE_GOTO(state, node, input) \
2894 pushinput = input; \
2896 st->resume_state = state; \
2897 goto push_yes_state;
2904 regmatch() - main matching routine
2906 This is basically one big switch statement in a loop. We execute an op,
2907 set 'next' to point the next op, and continue. If we come to a point which
2908 we may need to backtrack to on failure such as (A|B|C), we push a
2909 backtrack state onto the backtrack stack. On failure, we pop the top
2910 state, and re-enter the loop at the state indicated. If there are no more
2911 states to pop, we return failure.
2913 Sometimes we also need to backtrack on success; for example /A+/, where
2914 after successfully matching one A, we need to go back and try to
2915 match another one; similarly for lookahead assertions: if the assertion
2916 completes successfully, we backtrack to the state just before the assertion
2917 and then carry on. In these cases, the pushed state is marked as
2918 'backtrack on success too'. This marking is in fact done by a chain of
2919 pointers, each pointing to the previous 'yes' state. On success, we pop to
2920 the nearest yes state, discarding any intermediate failure-only states.
2921 Sometimes a yes state is pushed just to force some cleanup code to be
2922 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2923 it to free the inner regex.
2925 Note that failure backtracking rewinds the cursor position, while
2926 success backtracking leaves it alone.
2928 A pattern is complete when the END op is executed, while a subpattern
2929 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2930 ops trigger the "pop to last yes state if any, otherwise return true"
2933 A common convention in this function is to use A and B to refer to the two
2934 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2935 the subpattern to be matched possibly multiple times, while B is the entire
2936 rest of the pattern. Variable and state names reflect this convention.
2938 The states in the main switch are the union of ops and failure/success of
2939 substates associated with with that op. For example, IFMATCH is the op
2940 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2941 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2942 successfully matched A and IFMATCH_A_fail is a state saying that we have
2943 just failed to match A. Resume states always come in pairs. The backtrack
2944 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2945 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2946 on success or failure.
2948 The struct that holds a backtracking state is actually a big union, with
2949 one variant for each major type of op. The variable st points to the
2950 top-most backtrack struct. To make the code clearer, within each
2951 block of code we #define ST to alias the relevant union.
2953 Here's a concrete example of a (vastly oversimplified) IFMATCH
2959 #define ST st->u.ifmatch
2961 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2962 ST.foo = ...; // some state we wish to save
2964 // push a yes backtrack state with a resume value of
2965 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2967 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
2970 case IFMATCH_A: // we have successfully executed A; now continue with B
2972 bar = ST.foo; // do something with the preserved value
2975 case IFMATCH_A_fail: // A failed, so the assertion failed
2976 ...; // do some housekeeping, then ...
2977 sayNO; // propagate the failure
2984 For any old-timers reading this who are familiar with the old recursive
2985 approach, the code above is equivalent to:
2987 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2996 ...; // do some housekeeping, then ...
2997 sayNO; // propagate the failure
3000 The topmost backtrack state, pointed to by st, is usually free. If you
3001 want to claim it, populate any ST.foo fields in it with values you wish to
3002 save, then do one of
3004 PUSH_STATE_GOTO(resume_state, node, newinput);
3005 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3007 which sets that backtrack state's resume value to 'resume_state', pushes a
3008 new free entry to the top of the backtrack stack, then goes to 'node'.
3009 On backtracking, the free slot is popped, and the saved state becomes the
3010 new free state. An ST.foo field in this new top state can be temporarily
3011 accessed to retrieve values, but once the main loop is re-entered, it
3012 becomes available for reuse.
3014 Note that the depth of the backtrack stack constantly increases during the
3015 left-to-right execution of the pattern, rather than going up and down with
3016 the pattern nesting. For example the stack is at its maximum at Z at the
3017 end of the pattern, rather than at X in the following:
3019 /(((X)+)+)+....(Y)+....Z/
3021 The only exceptions to this are lookahead/behind assertions and the cut,
3022 (?>A), which pop all the backtrack states associated with A before
3025 Backtrack state structs are allocated in slabs of about 4K in size.
3026 PL_regmatch_state and st always point to the currently active state,
3027 and PL_regmatch_slab points to the slab currently containing
3028 PL_regmatch_state. The first time regmatch() is called, the first slab is
3029 allocated, and is never freed until interpreter destruction. When the slab
3030 is full, a new one is allocated and chained to the end. At exit from
3031 regmatch(), slabs allocated since entry are freed.
3036 #define DEBUG_STATE_pp(pp) \
3038 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3039 PerlIO_printf(Perl_debug_log, \
3040 " %*s"pp" %s%s%s%s%s\n", \
3042 PL_reg_name[st->resume_state], \
3043 ((st==yes_state||st==mark_state) ? "[" : ""), \
3044 ((st==yes_state) ? "Y" : ""), \
3045 ((st==mark_state) ? "M" : ""), \
3046 ((st==yes_state||st==mark_state) ? "]" : "") \
3051 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3056 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3057 const char *start, const char *end, const char *blurb)
3059 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3061 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3066 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3067 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3069 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3070 start, end - start, 60);
3072 PerlIO_printf(Perl_debug_log,
3073 "%s%s REx%s %s against %s\n",
3074 PL_colors[4], blurb, PL_colors[5], s0, s1);
3076 if (utf8_target||utf8_pat)
3077 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3078 utf8_pat ? "pattern" : "",
3079 utf8_pat && utf8_target ? " and " : "",
3080 utf8_target ? "string" : ""
3086 S_dump_exec_pos(pTHX_ const char *locinput,
3087 const regnode *scan,
3088 const char *loc_regeol,
3089 const char *loc_bostr,
3090 const char *loc_reg_starttry,
3091 const bool utf8_target)
3093 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3094 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3095 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3096 /* The part of the string before starttry has one color
3097 (pref0_len chars), between starttry and current
3098 position another one (pref_len - pref0_len chars),
3099 after the current position the third one.
3100 We assume that pref0_len <= pref_len, otherwise we
3101 decrease pref0_len. */
3102 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3103 ? (5 + taill) - l : locinput - loc_bostr;
3106 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3108 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3110 pref0_len = pref_len - (locinput - loc_reg_starttry);
3111 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3112 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3113 ? (5 + taill) - pref_len : loc_regeol - locinput);
3114 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3118 if (pref0_len > pref_len)
3119 pref0_len = pref_len;
3121 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3123 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3124 (locinput - pref_len),pref0_len, 60, 4, 5);
3126 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3127 (locinput - pref_len + pref0_len),
3128 pref_len - pref0_len, 60, 2, 3);
3130 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3131 locinput, loc_regeol - locinput, 10, 0, 1);
3133 const STRLEN tlen=len0+len1+len2;
3134 PerlIO_printf(Perl_debug_log,
3135 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3136 (IV)(locinput - loc_bostr),
3139 (docolor ? "" : "> <"),
3141 (int)(tlen > 19 ? 0 : 19 - tlen),
3148 /* reg_check_named_buff_matched()
3149 * Checks to see if a named buffer has matched. The data array of
3150 * buffer numbers corresponding to the buffer is expected to reside
3151 * in the regexp->data->data array in the slot stored in the ARG() of
3152 * node involved. Note that this routine doesn't actually care about the
3153 * name, that information is not preserved from compilation to execution.
3154 * Returns the index of the leftmost defined buffer with the given name
3155 * or 0 if non of the buffers matched.
3158 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3161 RXi_GET_DECL(rex,rexi);
3162 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3163 I32 *nums=(I32*)SvPVX(sv_dat);
3165 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3167 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3168 if ((I32)rex->lastparen >= nums[n] &&
3169 rex->offs[nums[n]].end != -1)
3178 /* free all slabs above current one - called during LEAVE_SCOPE */
3181 S_clear_backtrack_stack(pTHX_ void *p)
3183 regmatch_slab *s = PL_regmatch_slab->next;
3188 PL_regmatch_slab->next = NULL;
3190 regmatch_slab * const osl = s;
3197 /* returns -1 on failure, $+[0] on success */
3199 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3201 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3205 const bool utf8_target = PL_reg_match_utf8;
3206 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3207 REGEXP *rex_sv = reginfo->prog;
3208 regexp *rex = (struct regexp *)SvANY(rex_sv);
3209 RXi_GET_DECL(rex,rexi);
3211 /* the current state. This is a cached copy of PL_regmatch_state */
3213 /* cache heavy used fields of st in registers */
3216 U32 n = 0; /* general value; init to avoid compiler warning */
3217 I32 ln = 0; /* len or last; init to avoid compiler warning */
3218 char *locinput = startpos;
3219 char *pushinput; /* where to continue after a PUSH */
3220 I32 nextchr; /* is always set to UCHARAT(locinput) */
3222 bool result = 0; /* return value of S_regmatch */
3223 int depth = 0; /* depth of backtrack stack */
3224 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3225 const U32 max_nochange_depth =
3226 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3227 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3228 regmatch_state *yes_state = NULL; /* state to pop to on success of
3230 /* mark_state piggy backs on the yes_state logic so that when we unwind
3231 the stack on success we can update the mark_state as we go */
3232 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3233 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3234 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3236 bool no_final = 0; /* prevent failure from backtracking? */
3237 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3238 char *startpoint = locinput;
3239 SV *popmark = NULL; /* are we looking for a mark? */
3240 SV *sv_commit = NULL; /* last mark name seen in failure */
3241 SV *sv_yes_mark = NULL; /* last mark name we have seen
3242 during a successful match */
3243 U32 lastopen = 0; /* last open we saw */
3244 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3245 SV* const oreplsv = GvSV(PL_replgv);
3246 /* these three flags are set by various ops to signal information to
3247 * the very next op. They have a useful lifetime of exactly one loop
3248 * iteration, and are not preserved or restored by state pushes/pops
3250 bool sw = 0; /* the condition value in (?(cond)a|b) */
3251 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3252 int logical = 0; /* the following EVAL is:
3256 or the following IFMATCH/UNLESSM is:
3257 false: plain (?=foo)
3258 true: used as a condition: (?(?=foo))
3260 PAD* last_pad = NULL;
3262 I32 gimme = G_SCALAR;
3263 CV *caller_cv = NULL; /* who called us */
3264 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3265 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3268 GET_RE_DEBUG_FLAGS_DECL;
3271 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3272 multicall_oldcatch = 0;
3273 multicall_cv = NULL;
3275 PERL_UNUSED_VAR(multicall_cop);
3276 PERL_UNUSED_VAR(newsp);
3279 PERL_ARGS_ASSERT_REGMATCH;
3281 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3282 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3284 /* on first ever call to regmatch, allocate first slab */
3285 if (!PL_regmatch_slab) {
3286 Newx(PL_regmatch_slab, 1, regmatch_slab);
3287 PL_regmatch_slab->prev = NULL;
3288 PL_regmatch_slab->next = NULL;
3289 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3292 oldsave = PL_savestack_ix;
3293 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3294 SAVEVPTR(PL_regmatch_slab);
3295 SAVEVPTR(PL_regmatch_state);
3297 /* grab next free state slot */
3298 st = ++PL_regmatch_state;
3299 if (st > SLAB_LAST(PL_regmatch_slab))
3300 st = PL_regmatch_state = S_push_slab(aTHX);
3302 /* Note that nextchr is a byte even in UTF */
3303 nextchr = UCHARAT(locinput);
3305 while (scan != NULL) {
3308 SV * const prop = sv_newmortal();
3309 regnode *rnext=regnext(scan);
3310 DUMP_EXEC_POS( locinput, scan, utf8_target );
3311 regprop(rex, prop, scan);
3313 PerlIO_printf(Perl_debug_log,
3314 "%3"IVdf":%*s%s(%"IVdf")\n",
3315 (IV)(scan - rexi->program), depth*2, "",
3317 (PL_regkind[OP(scan)] == END || !rnext) ?
3318 0 : (IV)(rnext - rexi->program));
3321 next = scan + NEXT_OFF(scan);
3324 state_num = OP(scan);
3326 REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3329 switch (state_num) {
3331 if (locinput == PL_bostr)
3333 /* reginfo->till = reginfo->bol; */
3338 if (locinput == PL_bostr ||
3339 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3345 if (locinput == PL_bostr)
3349 if (locinput == reginfo->ganch)
3354 /* update the startpoint */
3355 st->u.keeper.val = rex->offs[0].start;
3356 rex->offs[0].start = locinput - PL_bostr;
3357 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
3359 case KEEPS_next_fail:
3360 /* rollback the start point change */
3361 rex->offs[0].start = st->u.keeper.val;
3367 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3372 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3374 if (PL_regeol - locinput > 1)
3378 if (PL_regeol != locinput)
3382 if (!nextchr && locinput >= PL_regeol)
3385 locinput += PL_utf8skip[nextchr];
3386 if (locinput > PL_regeol)
3388 nextchr = UCHARAT(locinput);
3391 nextchr = UCHARAT(++locinput);
3394 if (!nextchr && locinput >= PL_regeol)
3396 nextchr = UCHARAT(++locinput);
3399 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3402 locinput += PL_utf8skip[nextchr];
3403 if (locinput > PL_regeol)
3405 nextchr = UCHARAT(locinput);
3408 nextchr = UCHARAT(++locinput);
3412 #define ST st->u.trie
3414 /* In this case the charclass data is available inline so
3415 we can fail fast without a lot of extra overhead.
3417 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3419 PerlIO_printf(Perl_debug_log,
3420 "%*s %sfailed to match trie start class...%s\n",
3421 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3424 assert(0); /* NOTREACHED */
3428 /* the basic plan of execution of the trie is:
3429 * At the beginning, run though all the states, and
3430 * find the longest-matching word. Also remember the position
3431 * of the shortest matching word. For example, this pattern:
3434 * when matched against the string "abcde", will generate
3435 * accept states for all words except 3, with the longest
3436 * matching word being 4, and the shortest being 1 (with
3437 * the position being after char 1 of the string).
3439 * Then for each matching word, in word order (i.e. 1,2,4,5),
3440 * we run the remainder of the pattern; on each try setting
3441 * the current position to the character following the word,
3442 * returning to try the next word on failure.
3444 * We avoid having to build a list of words at runtime by
3445 * using a compile-time structure, wordinfo[].prev, which
3446 * gives, for each word, the previous accepting word (if any).
3447 * In the case above it would contain the mappings 1->2, 2->0,
3448 * 3->0, 4->5, 5->1. We can use this table to generate, from
3449 * the longest word (4 above), a list of all words, by
3450 * following the list of prev pointers; this gives us the
3451 * unordered list 4,5,1,2. Then given the current word we have
3452 * just tried, we can go through the list and find the
3453 * next-biggest word to try (so if we just failed on word 2,
3454 * the next in the list is 4).
3456 * Since at runtime we don't record the matching position in
3457 * the string for each word, we have to work that out for
3458 * each word we're about to process. The wordinfo table holds
3459 * the character length of each word; given that we recorded
3460 * at the start: the position of the shortest word and its
3461 * length in chars, we just need to move the pointer the
3462 * difference between the two char lengths. Depending on
3463 * Unicode status and folding, that's cheap or expensive.
3465 * This algorithm is optimised for the case where are only a
3466 * small number of accept states, i.e. 0,1, or maybe 2.
3467 * With lots of accepts states, and having to try all of them,
3468 * it becomes quadratic on number of accept states to find all
3473 /* what type of TRIE am I? (utf8 makes this contextual) */
3474 DECL_TRIE_TYPE(scan);
3476 /* what trie are we using right now */
3477 reg_trie_data * const trie
3478 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3479 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3480 U32 state = trie->startstate;
3482 if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
3483 if (trie->states[ state ].wordnum) {
3485 PerlIO_printf(Perl_debug_log,
3486 "%*s %smatched empty string...%s\n",
3487 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3493 PerlIO_printf(Perl_debug_log,
3494 "%*s %sfailed to match trie start class...%s\n",
3495 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3502 U8 *uc = ( U8* )locinput;
3506 U8 *uscan = (U8*)NULL;
3507 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3508 U32 charcount = 0; /* how many input chars we have matched */
3509 U32 accepted = 0; /* have we seen any accepting states? */
3511 ST.jump = trie->jump;
3514 ST.longfold = FALSE; /* char longer if folded => it's harder */
3517 /* fully traverse the TRIE; note the position of the
3518 shortest accept state and the wordnum of the longest
3521 while ( state && uc <= (U8*)PL_regeol ) {
3522 U32 base = trie->states[ state ].trans.base;
3526 wordnum = trie->states[ state ].wordnum;
3528 if (wordnum) { /* it's an accept state */
3531 /* record first match position */
3533 ST.firstpos = (U8*)locinput;
3538 ST.firstchars = charcount;
3541 if (!ST.nextword || wordnum < ST.nextword)
3542 ST.nextword = wordnum;
3543 ST.topword = wordnum;
3546 DEBUG_TRIE_EXECUTE_r({
3547 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3548 PerlIO_printf( Perl_debug_log,
3549 "%*s %sState: %4"UVxf" Accepted: %c ",
3550 2+depth * 2, "", PL_colors[4],
3551 (UV)state, (accepted ? 'Y' : 'N'));
3554 /* read a char and goto next state */
3557 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3558 uscan, len, uvc, charid, foldlen,
3565 base + charid - 1 - trie->uniquecharcount)) >= 0)
3567 && ((U32)offset < trie->lasttrans)
3568 && trie->trans[offset].check == state)
3570 state = trie->trans[offset].next;
3581 DEBUG_TRIE_EXECUTE_r(
3582 PerlIO_printf( Perl_debug_log,
3583 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3584 charid, uvc, (UV)state, PL_colors[5] );
3590 /* calculate total number of accept states */
3595 w = trie->wordinfo[w].prev;
3598 ST.accepted = accepted;
3602 PerlIO_printf( Perl_debug_log,
3603 "%*s %sgot %"IVdf" possible matches%s\n",
3604 REPORT_CODE_OFF + depth * 2, "",
3605 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3607 goto trie_first_try; /* jump into the fail handler */
3609 assert(0); /* NOTREACHED */
3611 case TRIE_next_fail: /* we failed - try next alternative */
3615 REGCP_UNWIND(ST.cp);
3616 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
3618 if (!--ST.accepted) {
3620 PerlIO_printf( Perl_debug_log,
3621 "%*s %sTRIE failed...%s\n",
3622 REPORT_CODE_OFF+depth*2, "",
3629 /* Find next-highest word to process. Note that this code
3630 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3633 U16 const nextword = ST.nextword;
3634 reg_trie_wordinfo * const wordinfo
3635 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3636 for (word=ST.topword; word; word=wordinfo[word].prev) {
3637 if (word > nextword && (!min || word < min))
3650 ST.lastparen = rex->lastparen;
3651 ST.lastcloseparen = rex->lastcloseparen;
3655 /* find start char of end of current word */
3657 U32 chars; /* how many chars to skip */
3658 reg_trie_data * const trie
3659 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3661 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3663 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3668 /* the hard option - fold each char in turn and find
3669 * its folded length (which may be different */
3670 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3678 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3686 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3691 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3707 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
3708 ? ST.jump[ST.nextword]
3712 PerlIO_printf( Perl_debug_log,
3713 "%*s %sTRIE matched word #%d, continuing%s\n",
3714 REPORT_CODE_OFF+depth*2, "",
3721 if (ST.accepted > 1 || has_cutgroup) {
3722 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
3723 assert(0); /* NOTREACHED */
3725 /* only one choice left - just continue */
3727 AV *const trie_words
3728 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3729 SV ** const tmp = av_fetch( trie_words,
3731 SV *sv= tmp ? sv_newmortal() : NULL;
3733 PerlIO_printf( Perl_debug_log,
3734 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3735 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3737 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3738 PL_colors[0], PL_colors[1],
3739 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
3741 : "not compiled under -Dr",
3745 locinput = (char*)uc;
3746 nextchr = UCHARAT(locinput);
3747 continue; /* execute rest of RE */
3748 assert(0); /* NOTREACHED */
3753 char *s = STRING(scan);
3755 if (utf8_target != UTF_PATTERN) {
3756 /* The target and the pattern have differing utf8ness. */
3758 const char * const e = s + ln;
3761 /* The target is utf8, the pattern is not utf8.
3762 * Above-Latin1 code points can't match the pattern;
3763 * invariants match exactly, and the other Latin1 ones need
3764 * to be downgraded to a single byte in order to do the
3765 * comparison. (If we could be confident that the target
3766 * is not malformed, this could be refactored to have fewer
3767 * tests by just assuming that if the first bytes match, it
3768 * is an invariant, but there are tests in the test suite
3769 * dealing with (??{...}) which violate this) */
3773 if (UTF8_IS_ABOVE_LATIN1(* (U8*) l)) {
3776 if (UTF8_IS_INVARIANT(*(U8*)l)) {
3783 if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) {
3792 /* The target is not utf8, the pattern is utf8. */
3794 if (l >= PL_regeol || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
3798 if (UTF8_IS_INVARIANT(*(U8*)s)) {
3805 if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) {
3814 nextchr = UCHARAT(locinput);
3817 /* The target and the pattern have the same utf8ness. */
3818 /* Inline the first character, for speed. */
3819 if (UCHARAT(s) != nextchr)
3821 if (PL_regeol - locinput < ln)
3823 if (ln > 1 && memNE(s, locinput, ln))
3826 nextchr = UCHARAT(locinput);
3831 const U8 * fold_array;
3833 U32 fold_utf8_flags;
3835 PL_reg_flags |= RF_tainted;
3836 folder = foldEQ_locale;
3837 fold_array = PL_fold_locale;
3838 fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
3842 case EXACTFU_TRICKYFOLD:
3844 folder = foldEQ_latin1;
3845 fold_array = PL_fold_latin1;
3846 fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0;
3850 folder = foldEQ_latin1;
3851 fold_array = PL_fold_latin1;
3852 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
3857 fold_array = PL_fold;
3858 fold_utf8_flags = 0;
3864 if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) {
3865 /* Either target or the pattern are utf8, or has the issue where
3866 * the fold lengths may differ. */
3867 const char * const l = locinput;
3868 char *e = PL_regeol;
3870 if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN),
3871 l, &e, 0, utf8_target, fold_utf8_flags))
3876 nextchr = UCHARAT(locinput);
3880 /* Neither the target nor the pattern are utf8 */
3881 if (UCHARAT(s) != nextchr &&
3882 UCHARAT(s) != fold_array[nextchr])
3886 if (PL_regeol - locinput < ln)
3888 if (ln > 1 && ! folder(s, locinput, ln))
3891 nextchr = UCHARAT(locinput);
3895 /* XXX Could improve efficiency by separating these all out using a
3896 * macro or in-line function. At that point regcomp.c would no longer
3897 * have to set the FLAGS fields of these */
3900 PL_reg_flags |= RF_tainted;
3908 /* was last char in word? */
3910 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
3911 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
3913 if (locinput == PL_bostr)
3916 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3918 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3920 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
3921 ln = isALNUM_uni(ln);
3922 LOAD_UTF8_CHARCLASS_ALNUM();
3923 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3926 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3927 n = isALNUM_LC_utf8((U8*)locinput);
3932 /* Here the string isn't utf8, or is utf8 and only ascii
3933 * characters are to match \w. In the latter case looking at
3934 * the byte just prior to the current one may be just the final
3935 * byte of a multi-byte character. This is ok. There are two
3937 * 1) it is a single byte character, and then the test is doing
3938 * just what it's supposed to.
3939 * 2) it is a multi-byte character, in which case the final
3940 * byte is never mistakable for ASCII, and so the test
3941 * will say it is not a word character, which is the
3942 * correct answer. */
3943 ln = (locinput != PL_bostr) ?
3944 UCHARAT(locinput - 1) : '\n';
3945 switch (FLAGS(scan)) {
3946 case REGEX_UNICODE_CHARSET:
3947 ln = isWORDCHAR_L1(ln);
3948 n = isWORDCHAR_L1(nextchr);
3950 case REGEX_LOCALE_CHARSET:
3951 ln = isALNUM_LC(ln);
3952 n = isALNUM_LC(nextchr);
3954 case REGEX_DEPENDS_CHARSET:
3956 n = isALNUM(nextchr);
3958 case REGEX_ASCII_RESTRICTED_CHARSET:
3959 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3960 ln = isWORDCHAR_A(ln);
3961 n = isWORDCHAR_A(nextchr);
3964 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
3968 /* Note requires that all BOUNDs be lower than all NBOUNDs in
3970 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
3975 if (utf8_target || state_num == ANYOFV) {
3976 STRLEN inclasslen = PL_regeol - locinput;
3977 if (locinput >= PL_regeol)
3980 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3982 locinput += inclasslen;
3983 nextchr = UCHARAT(locinput);
3988 nextchr = UCHARAT(locinput);
3989 if (!nextchr && locinput >= PL_regeol)
3991 if (!REGINCLASS(rex, scan, (U8*)locinput))
3993 nextchr = UCHARAT(++locinput);
3997 /* Special char classes - The defines start on line 129 or so */
3998 CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR,
3999 ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8,
4000 ALNUMU, NALNUMU, isWORDCHAR_L1,
4001 ALNUMA, NALNUMA, isWORDCHAR_A,
4004 CCC_TRY_U(SPACE, NSPACE, isSPACE,
4005 SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8,
4006 SPACEU, NSPACEU, isSPACE_L1,
4007 SPACEA, NSPACEA, isSPACE_A,
4010 CCC_TRY(DIGIT, NDIGIT, isDIGIT,
4011 DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8,
4012 DIGITA, NDIGITA, isDIGIT_A,
4016 if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) {
4019 /* Matched a utf8-invariant, so don't have to worry about utf8 */
4020 nextchr = UCHARAT(++locinput);
4023 if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) {
4027 locinput += PL_utf8skip[nextchr];
4028 nextchr = UCHARAT(locinput);
4031 nextchr = UCHARAT(++locinput);
4035 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4036 a Unicode extended Grapheme Cluster */
4037 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4038 extended Grapheme Cluster is:
4041 | Prepend* Begin Extend*
4044 Begin is: ( Special_Begin | ! Control )
4045 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4046 Extend is: ( Grapheme_Extend | Spacing_Mark )
4047 Control is: [ GCB_Control CR LF ]
4048 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4050 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4053 Begin is ( Regular_Begin + Special Begin )
4055 It turns out that 98.4% of all Unicode code points match
4056 Regular_Begin. Doing it this way eliminates a table match in
4057 the previous implementation for almost all Unicode code points.
4059 There is a subtlety with Prepend* which showed up in testing.
4060 Note that the Begin, and only the Begin is required in:
4061 | Prepend* Begin Extend*
4062 Also, Begin contains '! Control'. A Prepend must be a
4063 '! Control', which means it must also be a Begin. What it
4064 comes down to is that if we match Prepend* and then find no
4065 suitable Begin afterwards, that if we backtrack the last
4066 Prepend, that one will be a suitable Begin.
4069 if (locinput >= PL_regeol)
4071 if (! utf8_target) {
4073 /* Match either CR LF or '.', as all the other possibilities
4075 locinput++; /* Match the . or CR */
4076 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4078 && locinput < PL_regeol
4079 && UCHARAT(locinput) == '\n') locinput++;
4083 /* Utf8: See if is ( CR LF ); already know that locinput <
4084 * PL_regeol, so locinput+1 is in bounds */
4085 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
4091 /* In case have to backtrack to beginning, then match '.' */
4092 char *starting = locinput;
4094 /* In case have to backtrack the last prepend */
4095 char *previous_prepend = 0;
4097 LOAD_UTF8_CHARCLASS_GCB();
4099 /* Match (prepend)* */
4100 while (locinput < PL_regeol
4101 && (len = is_GCB_Prepend_utf8(locinput)))
4103 previous_prepend = locinput;
4107 /* As noted above, if we matched a prepend character, but
4108 * the next thing won't match, back off the last prepend we
4109 * matched, as it is guaranteed to match the begin */
4110 if (previous_prepend
4111 && (locinput >= PL_regeol
4112 || (! swash_fetch(PL_utf8_X_regular_begin,
4113 (U8*)locinput, utf8_target)
4114 && ! is_GCB_SPECIAL_BEGIN_utf8(locinput)))
4117 locinput = previous_prepend;
4120 /* Note that here we know PL_regeol > locinput, as we
4121 * tested that upon input to this switch case, and if we
4122 * moved locinput forward, we tested the result just above
4123 * and it either passed, or we backed off so that it will
4125 if (swash_fetch(PL_utf8_X_regular_begin,
4126 (U8*)locinput, utf8_target)) {
4127 locinput += UTF8SKIP(locinput);
4129 else if (! is_GCB_SPECIAL_BEGIN_utf8(locinput)) {
4131 /* Here did not match the required 'Begin' in the
4132 * second term. So just match the very first
4133 * character, the '.' of the final term of the regex */
4134 locinput = starting + UTF8SKIP(starting);
4138 /* Here is a special begin. It can be composed of
4139 * several individual characters. One possibility is
4141 if ((len = is_GCB_RI_utf8(locinput))) {
4143 while (locinput < PL_regeol
4144 && (len = is_GCB_RI_utf8(locinput)))
4148 } else if ((len = is_GCB_T_utf8(locinput))) {
4149 /* Another possibility is T+ */
4151 while (locinput < PL_regeol
4152 && (len = is_GCB_T_utf8(locinput)))
4158 /* Here, neither RI+ nor T+; must be some other
4159 * Hangul. That means it is one of the others: L,
4160 * LV, LVT or V, and matches:
4161 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4164 while (locinput < PL_regeol
4165 && (len = is_GCB_L_utf8(locinput)))
4170 /* Here, have exhausted L*. If the next character
4171 * is not an LV, LVT nor V, it means we had to have
4172 * at least one L, so matches L+ in the original
4173 * equation, we have a complete hangul syllable.
4176 if (locinput < PL_regeol
4177 && is_GCB_LV_LVT_V_utf8(locinput))
4180 /* Otherwise keep going. Must be LV, LVT or V.
4182 if (is_utf8_X_LVT((U8*)locinput)) {
4183 locinput += UTF8SKIP(locinput);
4186 /* Must be V or LV. Take it, then match
4188 locinput += UTF8SKIP(locinput);
4189 while (locinput < PL_regeol
4190 && (len = is_GCB_V_utf8(locinput)))
4196 /* And any of LV, LVT, or V can be followed
4198 while (locinput < PL_regeol
4199 && (len = is_GCB_T_utf8(locinput)))
4207 /* Match any extender */
4208 while (locinput < PL_regeol
4209 && swash_fetch(PL_utf8_X_extend,
4210 (U8*)locinput, utf8_target))
4212 locinput += UTF8SKIP(locinput);
4216 if (locinput > PL_regeol) sayNO;
4218 nextchr = UCHARAT(locinput);
4222 { /* The capture buffer cases. The ones beginning with N for the
4223 named buffers just convert to the equivalent numbered and
4224 pretend they were called as the corresponding numbered buffer
4226 /* don't initialize these in the declaration, it makes C++
4231 const U8 *fold_array;
4234 PL_reg_flags |= RF_tainted;
4235 folder = foldEQ_locale;
4236 fold_array = PL_fold_locale;
4238 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4242 folder = foldEQ_latin1;
4243 fold_array = PL_fold_latin1;
4245 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4249 folder = foldEQ_latin1;
4250 fold_array = PL_fold_latin1;
4252 utf8_fold_flags = 0;
4257 fold_array = PL_fold;
4259 utf8_fold_flags = 0;
4266 utf8_fold_flags = 0;
4269 /* For the named back references, find the corresponding buffer
4271 n = reg_check_named_buff_matched(rex,scan);
4276 goto do_nref_ref_common;
4279 PL_reg_flags |= RF_tainted;
4280 folder = foldEQ_locale;
4281 fold_array = PL_fold_locale;
4282 utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
4286 folder = foldEQ_latin1;
4287 fold_array = PL_fold_latin1;
4288 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4292 folder = foldEQ_latin1;
4293 fold_array = PL_fold_latin1;
4294 utf8_fold_flags = 0;
4299 fold_array = PL_fold;
4300 utf8_fold_flags = 0;
4306 utf8_fold_flags = 0;
4310 n = ARG(scan); /* which paren pair */
4313 ln = rex->offs[n].start;
4314 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4315 if (rex->lastparen < n || ln == -1)
4316 sayNO; /* Do not match unless seen CLOSEn. */
4317 if (ln == rex->offs[n].end)
4321 if (type != REF /* REF can do byte comparison */
4322 && (utf8_target || type == REFFU))
4323 { /* XXX handle REFFL better */
4324 char * limit = PL_regeol;
4326 /* This call case insensitively compares the entire buffer
4327 * at s, with the current input starting at locinput, but
4328 * not going off the end given by PL_regeol, and returns in
4329 * limit upon success, how much of the current input was
4331 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
4332 locinput, &limit, 0, utf8_target, utf8_fold_flags))
4337 nextchr = UCHARAT(locinput);
4341 /* Not utf8: Inline the first character, for speed. */
4342 if (UCHARAT(s) != nextchr &&
4344 UCHARAT(s) != fold_array[nextchr]))
4346 ln = rex->offs[n].end - ln;
4347 if (locinput + ln > PL_regeol)
4349 if (ln > 1 && (type == REF
4350 ? memNE(s, locinput, ln)
4351 : ! folder(s, locinput, ln)))
4354 nextchr = UCHARAT(locinput);
4364 #define ST st->u.eval
4369 regexp_internal *rei;
4370 regnode *startpoint;
4373 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
4374 if (cur_eval && cur_eval->locinput==locinput) {
4375 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
4376 Perl_croak(aTHX_ "Infinite recursion in regex");
4377 if ( ++nochange_depth > max_nochange_depth )
4379 "Pattern subroutine nesting without pos change"
4380 " exceeded limit in regex");
4387 if (OP(scan)==GOSUB) {
4388 startpoint = scan + ARG2L(scan);
4389 ST.close_paren = ARG(scan);
4391 startpoint = rei->program+1;
4394 goto eval_recurse_doit;
4395 assert(0); /* NOTREACHED */
4396 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4397 if (cur_eval && cur_eval->locinput==locinput) {
4398 if ( ++nochange_depth > max_nochange_depth )
4399 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4404 /* execute the code in the {...} */
4408 OP * const oop = PL_op;
4409 COP * const ocurcop = PL_curcop;
4411 char *saved_regeol = PL_regeol;
4412 struct re_save_state saved_state;
4415 /* save *all* paren positions */
4417 REGCP_SET(runops_cp);
4419 /* To not corrupt the existing regex state while executing the
4420 * eval we would normally put it on the save stack, like with
4421 * save_re_context. However, re-evals have a weird scoping so we
4422 * can't just add ENTER/LEAVE here. With that, things like
4424 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4426 * would break, as they expect the localisation to be unwound
4427 * only when the re-engine backtracks through the bit that
4430 * What we do instead is just saving the state in a local c
4433 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4435 PL_reg_state.re_reparsing = FALSE;
4438 caller_cv = find_runcv(NULL);
4442 if (rexi->data->what[n] == 'r') { /* code from an external qr */
4443 newcv = ((struct regexp *)SvANY(
4444 (REGEXP*)(rexi->data->data[n])
4447 nop = (OP*)rexi->data->data[n+1];
4449 else if (rexi->data->what[n] == 'l') { /* literal code */
4451 nop = (OP*)rexi->data->data[n];
4452 assert(CvDEPTH(newcv));
4455 /* literal with own CV */
4456 assert(rexi->data->what[n] == 'L');
4457 newcv = rex->qr_anoncv;
4458 nop = (OP*)rexi->data->data[n];
4461 /* normally if we're about to execute code from the same
4462 * CV that we used previously, we just use the existing
4463 * CX stack entry. However, its possible that in the
4464 * meantime we may have backtracked, popped from the save
4465 * stack, and undone the SAVECOMPPAD(s) associated with
4466 * PUSH_MULTICALL; in which case PL_comppad no longer
4467 * points to newcv's pad. */
4468 if (newcv != last_pushed_cv || PL_comppad != last_pad)
4470 I32 depth = (newcv == caller_cv) ? 0 : 1;
4471 if (last_pushed_cv) {
4472 CHANGE_MULTICALL_WITHDEPTH(newcv, depth);
4475 PUSH_MULTICALL_WITHDEPTH(newcv, depth);
4477 last_pushed_cv = newcv;
4479 last_pad = PL_comppad;
4481 /* the initial nextstate you would normally execute
4482 * at the start of an eval (which would cause error
4483 * messages to come from the eval), may be optimised
4484 * away from the execution path in the regex code blocks;
4485 * so manually set PL_curcop to it initially */
4487 OP *o = cUNOPx(nop)->op_first;
4488 assert(o->op_type == OP_NULL);
4489 if (o->op_targ == OP_SCOPE) {
4490 o = cUNOPo->op_first;
4493 assert(o->op_targ == OP_LEAVE);
4494 o = cUNOPo->op_first;
4495 assert(o->op_type == OP_ENTER);
4499 if (o->op_type != OP_STUB) {
4500 assert( o->op_type == OP_NEXTSTATE
4501 || o->op_type == OP_DBSTATE
4502 || (o->op_type == OP_NULL
4503 && ( o->op_targ == OP_NEXTSTATE
4504 || o->op_targ == OP_DBSTATE
4508 PL_curcop = (COP*)o;
4513 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4514 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
4516 rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4519 SV *sv_mrk = get_sv("REGMARK", 1);
4520 sv_setsv(sv_mrk, sv_yes_mark);
4523 /* we don't use MULTICALL here as we want to call the
4524 * first op of the block of interest, rather than the
4525 * first op of the sub */
4528 CALLRUNOPS(aTHX); /* Scalar context. */
4531 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
4537 /* before restoring everything, evaluate the returned
4538 * value, so that 'uninit' warnings don't use the wrong
4539 * PL_op or pad. Also need to process any magic vars
4540 * (e.g. $1) *before* parentheses are restored */
4545 if (logical == 0) /* (?{})/ */
4546 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
4547 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
4548 sw = cBOOL(SvTRUE(ret));
4551 else { /* /(??{}) */
4552 /* if its overloaded, let the regex compiler handle
4553 * it; otherwise extract regex, or stringify */
4554 if (!SvAMAGIC(ret)) {
4558 if (SvTYPE(sv) == SVt_REGEXP)
4559 re_sv = (REGEXP*) sv;
4560 else if (SvSMAGICAL(sv)) {
4561 MAGIC *mg = mg_find(sv, PERL_MAGIC_qr);
4563 re_sv = (REGEXP *) mg->mg_obj;
4566 /* force any magic, undef warnings here */
4568 ret = sv_mortalcopy(ret);
4569 (void) SvPV_force_nolen(ret);
4575 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4577 /* *** Note that at this point we don't restore
4578 * PL_comppad, (or pop the CxSUB) on the assumption it may
4579 * be used again soon. This is safe as long as nothing
4580 * in the regexp code uses the pad ! */
4582 PL_curcop = ocurcop;
4583 PL_regeol = saved_regeol;
4584 S_regcp_restore(aTHX_ rex, runops_cp);
4590 /* only /(??{})/ from now on */
4593 /* extract RE object from returned value; compiling if
4597 re_sv = reg_temp_copy(NULL, re_sv);
4601 const I32 osize = PL_regsize;
4603 if (SvUTF8(ret) && IN_BYTES) {
4604 /* In use 'bytes': make a copy of the octet
4605 * sequence, but without the flag on */
4607 const char *const p = SvPV(ret, len);
4608 ret = newSVpvn_flags(p, len, SVs_TEMP);
4610 if (rex->intflags & PREGf_USE_RE_EVAL)
4611 pm_flags |= PMf_USE_RE_EVAL;
4613 /* if we got here, it should be an engine which
4614 * supports compiling code blocks and stuff */
4615 assert(rex->engine && rex->engine->op_comp);
4616 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
4617 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
4618 rex->engine, NULL, NULL,
4619 /* copy /msix etc to inner pattern */
4624 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4626 /* This isn't a first class regexp. Instead, it's
4627 caching a regexp onto an existing, Perl visible
4629 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
4632 /* safe to do now that any $1 etc has been
4633 * interpolated into the new pattern string and
4635 S_regcp_restore(aTHX_ rex, runops_cp);
4637 re = (struct regexp *)SvANY(re_sv);
4639 RXp_MATCH_COPIED_off(re);
4640 re->subbeg = rex->subbeg;
4641 re->sublen = rex->sublen;
4642 re->suboffset = rex->suboffset;
4643 re->subcoffset = rex->subcoffset;
4646 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4647 "Matching embedded");
4649 startpoint = rei->program + 1;
4650 ST.close_paren = 0; /* only used for GOSUB */
4652 eval_recurse_doit: /* Share code with GOSUB below this line */
4653 /* run the pattern returned from (??{...}) */
4654 ST.cp = regcppush(rex, 0); /* Save *all* the positions. */
4655 REGCP_SET(ST.lastcp);
4658 re->lastcloseparen = 0;
4662 /* XXXX This is too dramatic a measure... */
4665 ST.toggle_reg_flags = PL_reg_flags;
4667 PL_reg_flags |= RF_utf8;
4669 PL_reg_flags &= ~RF_utf8;
4670 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4672 ST.prev_rex = rex_sv;
4673 ST.prev_curlyx = cur_curlyx;
4675 SET_reg_curpm(rex_sv);
4680 ST.prev_eval = cur_eval;
4682 /* now continue from first node in postoned RE */
4683 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
4684 assert(0); /* NOTREACHED */
4687 case EVAL_AB: /* cleanup after a successful (??{A})B */
4688 /* note: this is called twice; first after popping B, then A */
4689 PL_reg_flags ^= ST.toggle_reg_flags;
4690 rex_sv = ST.prev_rex;
4691 SET_reg_curpm(rex_sv);
4692 rex = (struct regexp *)SvANY(rex_sv);
4693 rexi = RXi_GET(rex);
4695 cur_eval = ST.prev_eval;
4696 cur_curlyx = ST.prev_curlyx;
4698 /* XXXX This is too dramatic a measure... */
4700 if ( nochange_depth )
4705 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4706 /* note: this is called twice; first after popping B, then A */
4707 PL_reg_flags ^= ST.toggle_reg_flags;
4708 rex_sv = ST.prev_rex;
4709 SET_reg_curpm(rex_sv);
4710 rex = (struct regexp *)SvANY(rex_sv);
4711 rexi = RXi_GET(rex);
4713 REGCP_UNWIND(ST.lastcp);
4715 cur_eval = ST.prev_eval;
4716 cur_curlyx = ST.prev_curlyx;
4717 /* XXXX This is too dramatic a measure... */
4719 if ( nochange_depth )
4725 n = ARG(scan); /* which paren pair */
4726 rex->offs[n].start_tmp = locinput - PL_bostr;
4729 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
4730 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n",
4734 (IV)rex->offs[n].start_tmp,
4740 /* XXX really need to log other places start/end are set too */
4741 #define CLOSE_CAPTURE \
4742 rex->offs[n].start = rex->offs[n].start_tmp; \
4743 rex->offs[n].end = locinput - PL_bostr; \
4744 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
4745 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
4747 PTR2UV(rex->offs), \
4749 (IV)rex->offs[n].start, \
4750 (IV)rex->offs[n].end \
4754 n = ARG(scan); /* which paren pair */
4756 /*if (n > PL_regsize)
4758 if (n > rex->lastparen)
4760 rex->lastcloseparen = n;
4761 if (cur_eval && cur_eval->u.eval.close_paren == n) {
4769 cursor && OP(cursor)!=END;
4770 cursor=regnext(cursor))
4772 if ( OP(cursor)==CLOSE ){
4774 if ( n <= lastopen ) {
4776 /*if (n > PL_regsize)
4778 if (n > rex->lastparen)
4780 rex->lastcloseparen = n;
4781 if ( n == ARG(scan) || (cur_eval &&
4782 cur_eval->u.eval.close_paren == n))
4791 n = ARG(scan); /* which paren pair */
4792 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
4795 /* reg_check_named_buff_matched returns 0 for no match */
4796 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4800 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4806 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4808 next = NEXTOPER(NEXTOPER(scan));
4810 next = scan + ARG(scan);
4811 if (OP(next) == IFTHEN) /* Fake one. */
4812 next = NEXTOPER(NEXTOPER(next));
4816 logical = scan->flags;
4819 /*******************************************************************
4821 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4822 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4823 STAR/PLUS/CURLY/CURLYN are used instead.)
4825 A*B is compiled as <CURLYX><A><WHILEM><B>
4827 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4828 state, which contains the current count, initialised to -1. It also sets
4829 cur_curlyx to point to this state, with any previous value saved in the
4832 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4833 since the pattern may possibly match zero times (i.e. it's a while {} loop
4834 rather than a do {} while loop).
4836 Each entry to WHILEM represents a successful match of A. The count in the
4837 CURLYX block is incremented, another WHILEM state is pushed, and execution
4838 passes to A or B depending on greediness and the current count.
4840 For example, if matching against the string a1a2a3b (where the aN are
4841 substrings that match /A/), then the match progresses as follows: (the
4842 pushed states are interspersed with the bits of strings matched so far):
4845 <CURLYX cnt=0><WHILEM>
4846 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4847 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4848 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4849 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4851 (Contrast this with something like CURLYM, which maintains only a single
4855 a1 <CURLYM cnt=1> a2
4856 a1 a2 <CURLYM cnt=2> a3
4857 a1 a2 a3 <CURLYM cnt=3> b
4860 Each WHILEM state block marks a point to backtrack to upon partial failure
4861 of A or B, and also contains some minor state data related to that
4862 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4863 overall state, such as the count, and pointers to the A and B ops.
4865 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4866 must always point to the *current* CURLYX block, the rules are:
4868 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4869 and set cur_curlyx to point the new block.
4871 When popping the CURLYX block after a successful or unsuccessful match,
4872 restore the previous cur_curlyx.
4874 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4875 to the outer one saved in the CURLYX block.
4877 When popping the WHILEM block after a successful or unsuccessful B match,
4878 restore the previous cur_curlyx.
4880 Here's an example for the pattern (AI* BI)*BO
4881 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4884 curlyx backtrack stack
4885 ------ ---------------
4887 CO <CO prev=NULL> <WO>
4888 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4889 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4890 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4892 At this point the pattern succeeds, and we work back down the stack to
4893 clean up, restoring as we go:
4895 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4896 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4897 CO <CO prev=NULL> <WO>
4900 *******************************************************************/
4902 #define ST st->u.curlyx
4904 case CURLYX: /* start of /A*B/ (for complex A) */
4906 /* No need to save/restore up to this paren */
4907 I32 parenfloor = scan->flags;
4909 assert(next); /* keep Coverity happy */
4910 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4913 /* XXXX Probably it is better to teach regpush to support
4914 parenfloor > PL_regsize... */
4915 if (parenfloor > (I32)rex->lastparen)
4916 parenfloor = rex->lastparen; /* Pessimization... */
4918 ST.prev_curlyx= cur_curlyx;
4920 ST.cp = PL_savestack_ix;
4922 /* these fields contain the state of the current curly.
4923 * they are accessed by subsequent WHILEMs */
4924 ST.parenfloor = parenfloor;
4929 ST.count = -1; /* this will be updated by WHILEM */
4930 ST.lastloc = NULL; /* this will be updated by WHILEM */
4932 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
4933 assert(0); /* NOTREACHED */
4936 case CURLYX_end: /* just finished matching all of A*B */
4937 cur_curlyx = ST.prev_curlyx;
4939 assert(0); /* NOTREACHED */
4941 case CURLYX_end_fail: /* just failed to match all of A*B */
4943 cur_curlyx = ST.prev_curlyx;
4945 assert(0); /* NOTREACHED */
4949 #define ST st->u.whilem
4951 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4953 /* see the discussion above about CURLYX/WHILEM */
4955 int min = ARG1(cur_curlyx->u.curlyx.me);
4956 int max = ARG2(cur_curlyx->u.curlyx.me);
4957 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4959 assert(cur_curlyx); /* keep Coverity happy */
4960 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4961 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4962 ST.cache_offset = 0;
4966 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4967 "%*s whilem: matched %ld out of %d..%d\n",
4968 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4971 /* First just match a string of min A's. */
4974 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
4975 cur_curlyx->u.curlyx.lastloc = locinput;
4976 REGCP_SET(ST.lastcp);
4978 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
4979 assert(0); /* NOTREACHED */
4982 /* If degenerate A matches "", assume A done. */
4984 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4985 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4986 "%*s whilem: empty match detected, trying continuation...\n",
4987 REPORT_CODE_OFF+depth*2, "")
4989 goto do_whilem_B_max;
4992 /* super-linear cache processing */
4996 if (!PL_reg_maxiter) {
4997 /* start the countdown: Postpone detection until we
4998 * know the match is not *that* much linear. */
4999 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
5000 /* possible overflow for long strings and many CURLYX's */
5001 if (PL_reg_maxiter < 0)
5002 PL_reg_maxiter = I32_MAX;
5003 PL_reg_leftiter = PL_reg_maxiter;
5006 if (PL_reg_leftiter-- == 0) {
5007 /* initialise cache */
5008 const I32 size = (PL_reg_maxiter + 7)/8;
5009 if (PL_reg_poscache) {
5010 if ((I32)PL_reg_poscache_size < size) {
5011 Renew(PL_reg_poscache, size, char);
5012 PL_reg_poscache_size = size;
5014 Zero(PL_reg_poscache, size, char);
5017 PL_reg_poscache_size = size;
5018 Newxz(PL_reg_poscache, size, char);
5020 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5021 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5022 PL_colors[4], PL_colors[5])
5026 if (PL_reg_leftiter < 0) {
5027 /* have we already failed at this position? */
5029 offset = (scan->flags & 0xf) - 1
5030 + (locinput - PL_bostr) * (scan->flags>>4);
5031 mask = 1 << (offset % 8);
5033 if (PL_reg_poscache[offset] & mask) {
5034 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5035 "%*s whilem: (cache) already tried at this position...\n",
5036 REPORT_CODE_OFF+depth*2, "")
5038 sayNO; /* cache records failure */
5040 ST.cache_offset = offset;
5041 ST.cache_mask = mask;
5045 /* Prefer B over A for minimal matching. */
5047 if (cur_curlyx->u.curlyx.minmod) {
5048 ST.save_curlyx = cur_curlyx;
5049 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5050 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor);
5051 REGCP_SET(ST.lastcp);
5052 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5054 assert(0); /* NOTREACHED */
5057 /* Prefer A over B for maximal matching. */
5059 if (n < max) { /* More greed allowed? */
5060 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5061 cur_curlyx->u.curlyx.lastloc = locinput;
5062 REGCP_SET(ST.lastcp);
5063 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5064 assert(0); /* NOTREACHED */
5066 goto do_whilem_B_max;
5068 assert(0); /* NOTREACHED */
5070 case WHILEM_B_min: /* just matched B in a minimal match */
5071 case WHILEM_B_max: /* just matched B in a maximal match */
5072 cur_curlyx = ST.save_curlyx;
5074 assert(0); /* NOTREACHED */
5076 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5077 cur_curlyx = ST.save_curlyx;
5078 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5079 cur_curlyx->u.curlyx.count--;
5081 assert(0); /* NOTREACHED */
5083 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5085 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5086 REGCP_UNWIND(ST.lastcp);
5088 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5089 cur_curlyx->u.curlyx.count--;
5091 assert(0); /* NOTREACHED */
5093 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5094 REGCP_UNWIND(ST.lastcp);
5095 regcppop(rex); /* Restore some previous $<digit>s? */
5096 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5097 "%*s whilem: failed, trying continuation...\n",
5098 REPORT_CODE_OFF+depth*2, "")
5101 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5102 && ckWARN(WARN_REGEXP)
5103 && !(PL_reg_flags & RF_warned))
5105 PL_reg_flags |= RF_warned;
5106 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5107 "Complex regular subexpression recursion limit (%d) "
5113 ST.save_curlyx = cur_curlyx;
5114 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5115 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5117 assert(0); /* NOTREACHED */
5119 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5120 cur_curlyx = ST.save_curlyx;
5121 REGCP_UNWIND(ST.lastcp);
5124 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5125 /* Maximum greed exceeded */
5126 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5127 && ckWARN(WARN_REGEXP)
5128 && !(PL_reg_flags & RF_warned))
5130 PL_reg_flags |= RF_warned;
5131 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5132 "Complex regular subexpression recursion "
5133 "limit (%d) exceeded",
5136 cur_curlyx->u.curlyx.count--;
5140 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5141 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
5143 /* Try grabbing another A and see if it helps. */
5144 cur_curlyx->u.curlyx.lastloc = locinput;
5145 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor);
5146 REGCP_SET(ST.lastcp);
5147 PUSH_STATE_GOTO(WHILEM_A_min,
5148 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
5150 assert(0); /* NOTREACHED */
5153 #define ST st->u.branch
5155 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
5156 next = scan + ARG(scan);
5159 scan = NEXTOPER(scan);
5162 case BRANCH: /* /(...|A|...)/ */
5163 scan = NEXTOPER(scan); /* scan now points to inner node */
5164 ST.lastparen = rex->lastparen;
5165 ST.lastcloseparen = rex->lastcloseparen;
5166 ST.next_branch = next;
5169 /* Now go into the branch */
5171 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
5173 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
5175 assert(0); /* NOTREACHED */
5177 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
5178 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5179 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
5180 assert(0); /* NOTREACHED */
5181 case CUTGROUP_next_fail:
5184 if (st->u.mark.mark_name)
5185 sv_commit = st->u.mark.mark_name;
5187 assert(0); /* NOTREACHED */
5190 assert(0); /* NOTREACHED */
5191 case BRANCH_next_fail: /* that branch failed; try the next, if any */
5196 REGCP_UNWIND(ST.cp);
5197 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5198 scan = ST.next_branch;
5199 /* no more branches? */
5200 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
5202 PerlIO_printf( Perl_debug_log,
5203 "%*s %sBRANCH failed...%s\n",
5204 REPORT_CODE_OFF+depth*2, "",
5210 continue; /* execute next BRANCH[J] op */
5211 assert(0); /* NOTREACHED */
5218 #define ST st->u.curlym
5220 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
5222 /* This is an optimisation of CURLYX that enables us to push
5223 * only a single backtracking state, no matter how many matches
5224 * there are in {m,n}. It relies on the pattern being constant
5225 * length, with no parens to influence future backrefs
5229 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5231 ST.lastparen = rex->lastparen;
5232 ST.lastcloseparen = rex->lastcloseparen;
5234 /* if paren positive, emulate an OPEN/CLOSE around A */
5236 U32 paren = ST.me->flags;
5237 if (paren > PL_regsize)
5239 scan += NEXT_OFF(scan); /* Skip former OPEN. */
5247 ST.c1 = CHRTEST_UNINIT;
5250 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
5253 curlym_do_A: /* execute the A in /A{m,n}B/ */
5254 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
5255 assert(0); /* NOTREACHED */
5257 case CURLYM_A: /* we've just matched an A */
5259 /* after first match, determine A's length: u.curlym.alen */
5260 if (ST.count == 1) {
5261 if (PL_reg_match_utf8) {
5262 char *s = st->locinput;
5263 while (s < locinput) {
5269 ST.alen = locinput - st->locinput;
5272 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
5275 PerlIO_printf(Perl_debug_log,
5276 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
5277 (int)(REPORT_CODE_OFF+(depth*2)), "",
5278 (IV) ST.count, (IV)ST.alen)
5281 if (cur_eval && cur_eval->u.eval.close_paren &&
5282 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5286 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
5287 if ( max == REG_INFTY || ST.count < max )
5288 goto curlym_do_A; /* try to match another A */
5290 goto curlym_do_B; /* try to match B */
5292 case CURLYM_A_fail: /* just failed to match an A */
5293 REGCP_UNWIND(ST.cp);
5295 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
5296 || (cur_eval && cur_eval->u.eval.close_paren &&
5297 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
5300 curlym_do_B: /* execute the B in /A{m,n}B/ */
5301 if (ST.c1 == CHRTEST_UNINIT) {
5302 /* calculate c1 and c2 for possible match of 1st char
5303 * following curly */
5304 ST.c1 = ST.c2 = CHRTEST_VOID;
5305 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
5306 regnode *text_node = ST.B;
5307 if (! HAS_TEXT(text_node))
5308 FIND_NEXT_IMPT(text_node);
5311 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
5313 But the former is redundant in light of the latter.
5315 if this changes back then the macro for
5316 IS_TEXT and friends need to change.
5318 if (PL_regkind[OP(text_node)] == EXACT)
5321 ST.c1 = (U8)*STRING(text_node);
5322 switch (OP(text_node)) {
5323 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5326 case EXACTFU_TRICKYFOLD:
5327 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5328 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5329 default: ST.c2 = ST.c1;
5336 PerlIO_printf(Perl_debug_log,
5337 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
5338 (int)(REPORT_CODE_OFF+(depth*2)),
5341 if (ST.c1 != CHRTEST_VOID
5343 && nextchr != ST.c2)
5345 /* simulate B failing */
5347 PerlIO_printf(Perl_debug_log,
5348 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
5349 (int)(REPORT_CODE_OFF+(depth*2)),"",
5352 state_num = CURLYM_B_fail;
5353 goto reenter_switch;
5357 /* emulate CLOSE: mark current A as captured */
5358 I32 paren = ST.me->flags;
5360 rex->offs[paren].start
5361 = HOPc(locinput, -ST.alen) - PL_bostr;
5362 rex->offs[paren].end = locinput - PL_bostr;
5363 if ((U32)paren > rex->lastparen)
5364 rex->lastparen = paren;
5365 rex->lastcloseparen = paren;
5368 rex->offs[paren].end = -1;
5369 if (cur_eval && cur_eval->u.eval.close_paren &&
5370 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
5379 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
5380 assert(0); /* NOTREACHED */
5382 case CURLYM_B_fail: /* just failed to match a B */
5383 REGCP_UNWIND(ST.cp);
5384 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5386 I32 max = ARG2(ST.me);
5387 if (max != REG_INFTY && ST.count == max)
5389 goto curlym_do_A; /* try to match a further A */
5391 /* backtrack one A */
5392 if (ST.count == ARG1(ST.me) /* min */)
5395 locinput = HOPc(locinput, -ST.alen);
5396 nextchr = UCHARAT(locinput);
5397 goto curlym_do_B; /* try to match B */
5400 #define ST st->u.curly
5402 #define CURLY_SETPAREN(paren, success) \
5405 rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \
5406 rex->offs[paren].end = locinput - PL_bostr; \
5407 if (paren > rex->lastparen) \
5408 rex->lastparen = paren; \
5409 rex->lastcloseparen = paren; \
5412 rex->offs[paren].end = -1; \
5413 rex->lastparen = ST.lastparen; \
5414 rex->lastcloseparen = ST.lastcloseparen; \
5418 case STAR: /* /A*B/ where A is width 1 */
5422 scan = NEXTOPER(scan);
5424 case PLUS: /* /A+B/ where A is width 1 */
5428 scan = NEXTOPER(scan);
5430 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
5431 ST.paren = scan->flags; /* Which paren to set */
5432 ST.lastparen = rex->lastparen;
5433 ST.lastcloseparen = rex->lastcloseparen;
5434 if (ST.paren > PL_regsize)
5435 PL_regsize = ST.paren;
5436 ST.min = ARG1(scan); /* min to match */
5437 ST.max = ARG2(scan); /* max to match */
5438 if (cur_eval && cur_eval->u.eval.close_paren &&
5439 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5443 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
5445 case CURLY: /* /A{m,n}B/ where A is width 1 */
5447 ST.min = ARG1(scan); /* min to match */
5448 ST.max = ARG2(scan); /* max to match */
5449 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
5452 * Lookahead to avoid useless match attempts
5453 * when we know what character comes next.
5455 * Used to only do .*x and .*?x, but now it allows
5456 * for )'s, ('s and (?{ ... })'s to be in the way
5457 * of the quantifier and the EXACT-like node. -- japhy
5460 if (ST.min > ST.max) /* XXX make this a compile-time check? */
5462 if (HAS_TEXT(next) || JUMPABLE(next)) {
5464 regnode *text_node = next;
5466 if (! HAS_TEXT(text_node))
5467 FIND_NEXT_IMPT(text_node);
5469 if (! HAS_TEXT(text_node))
5470 ST.c1 = ST.c2 = CHRTEST_VOID;
5472 if ( PL_regkind[OP(text_node)] != EXACT ) {
5473 ST.c1 = ST.c2 = CHRTEST_VOID;
5474 goto assume_ok_easy;
5477 s = (U8*)STRING(text_node);
5479 /* Currently we only get here when
5481 PL_rekind[OP(text_node)] == EXACT
5483 if this changes back then the macro for IS_TEXT and
5484 friends need to change. */
5487 switch (OP(text_node)) {
5488 case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
5491 case EXACTFU_TRICKYFOLD:
5492 case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
5493 case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
5494 default: ST.c2 = ST.c1; break;
5497 else { /* UTF_PATTERN */
5498 if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) {
5500 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5502 to_utf8_fold((U8*)s, tmpbuf, &ulen);
5503 ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0,
5507 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5514 ST.c1 = ST.c2 = CHRTEST_VOID;
5520 char *li = locinput;
5522 if (ST.min && regrepeat(rex, &li, ST.A, ST.min, depth) < ST.min)
5525 nextchr = UCHARAT(locinput);
5528 if (ST.c1 == CHRTEST_VOID)
5529 goto curly_try_B_min;
5531 ST.oldloc = locinput;
5533 /* set ST.maxpos to the furthest point along the
5534 * string that could possibly match */
5535 if (ST.max == REG_INFTY) {
5536 ST.maxpos = PL_regeol - 1;
5538 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5541 else if (utf8_target) {
5542 int m = ST.max - ST.min;
5543 for (ST.maxpos = locinput;
5544 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5545 ST.maxpos += UTF8SKIP(ST.maxpos);
5548 ST.maxpos = locinput + ST.max - ST.min;
5549 if (ST.maxpos >= PL_regeol)
5550 ST.maxpos = PL_regeol - 1;
5552 goto curly_try_B_min_known;
5556 /* avoid taking address of locinput, so it can remain
5558 char *li = locinput;
5559 ST.count = regrepeat(rex, &li, ST.A, ST.max, depth);
5560 if (ST.count < ST.min)
5563 nextchr = UCHARAT(locinput);
5564 if ((ST.count > ST.min)
5565 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5567 /* A{m,n} must come at the end of the string, there's
5568 * no point in backing off ... */
5570 /* ...except that $ and \Z can match before *and* after
5571 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5572 We may back off by one in this case. */
5573 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
5577 goto curly_try_B_max;
5579 assert(0); /* NOTREACHED */
5582 case CURLY_B_min_known_fail:
5583 /* failed to find B in a non-greedy match where c1,c2 valid */
5585 REGCP_UNWIND(ST.cp);
5587 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5589 /* Couldn't or didn't -- move forward. */
5590 ST.oldloc = locinput;
5592 locinput += UTF8SKIP(locinput);
5596 curly_try_B_min_known:
5597 /* find the next place where 'B' could work, then call B */
5601 n = (ST.oldloc == locinput) ? 0 : 1;
5602 if (ST.c1 == ST.c2) {
5604 /* set n to utf8_distance(oldloc, locinput) */
5605 while (locinput <= ST.maxpos &&
5606 utf8n_to_uvchr((U8*)locinput,
5607 UTF8_MAXBYTES, &len,
5608 uniflags) != (UV)ST.c1) {
5614 /* set n to utf8_distance(oldloc, locinput) */
5615 while (locinput <= ST.maxpos) {
5617 const UV c = utf8n_to_uvchr((U8*)locinput,
5618 UTF8_MAXBYTES, &len,
5620 if (c == (UV)ST.c1 || c == (UV)ST.c2)
5628 if (ST.c1 == ST.c2) {
5629 while (locinput <= ST.maxpos &&
5630 UCHARAT(locinput) != ST.c1)
5634 while (locinput <= ST.maxpos
5635 && UCHARAT(locinput) != ST.c1
5636 && UCHARAT(locinput) != ST.c2)
5639 n = locinput - ST.oldloc;
5641 if (locinput > ST.maxpos)
5644 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
5645 * at b; check that everything between oldloc and
5646 * locinput matches */
5647 char *li = ST.oldloc;
5649 if (regrepeat(rex, &li, ST.A, n, depth) < n)
5651 assert(n == REG_INFTY || locinput == li);
5653 CURLY_SETPAREN(ST.paren, ST.count);
5654 if (cur_eval && cur_eval->u.eval.close_paren &&
5655 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5658 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
5660 assert(0); /* NOTREACHED */
5663 case CURLY_B_min_fail:
5664 /* failed to find B in a non-greedy match where c1,c2 invalid */
5666 REGCP_UNWIND(ST.cp);
5668 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5670 /* failed -- move forward one */
5672 char *li = locinput;
5673 if (!regrepeat(rex, &li, ST.A, 1, depth)) {
5680 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5681 ST.count > 0)) /* count overflow ? */
5684 CURLY_SETPAREN(ST.paren, ST.count);
5685 if (cur_eval && cur_eval->u.eval.close_paren &&
5686 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5689 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
5692 assert(0); /* NOTREACHED */
5696 /* a successful greedy match: now try to match B */
5697 if (cur_eval && cur_eval->u.eval.close_paren &&
5698 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5703 if (ST.c1 != CHRTEST_VOID)
5704 c = utf8_target ? utf8n_to_uvchr((U8*)locinput,
5705 UTF8_MAXBYTES, 0, uniflags)
5706 : (UV) UCHARAT(locinput);
5707 /* If it could work, try it. */
5708 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5709 CURLY_SETPAREN(ST.paren, ST.count);
5710 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
5711 assert(0); /* NOTREACHED */
5715 case CURLY_B_max_fail:
5716 /* failed to find B in a greedy match */
5718 REGCP_UNWIND(ST.cp);
5720 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5723 if (--ST.count < ST.min)
5725 locinput = HOPc(locinput, -1);
5726 goto curly_try_B_max;
5733 /* we've just finished A in /(??{A})B/; now continue with B */
5734 st->u.eval.toggle_reg_flags
5735 = cur_eval->u.eval.toggle_reg_flags;
5736 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5738 st->u.eval.prev_rex = rex_sv; /* inner */
5739 st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */
5740 rex_sv = cur_eval->u.eval.prev_rex;
5741 SET_reg_curpm(rex_sv);
5742 rex = (struct regexp *)SvANY(rex_sv);
5743 rexi = RXi_GET(rex);
5744 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5746 REGCP_SET(st->u.eval.lastcp);
5748 /* Restore parens of the outer rex without popping the
5750 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp);
5752 st->u.eval.prev_eval = cur_eval;
5753 cur_eval = cur_eval->u.eval.prev_eval;
5755 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5756 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5757 if ( nochange_depth )
5760 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
5761 locinput); /* match B */
5764 if (locinput < reginfo->till) {
5765 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5766 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5768 (long)(locinput - PL_reg_starttry),
5769 (long)(reginfo->till - PL_reg_starttry),
5772 sayNO_SILENT; /* Cannot match: too short. */
5774 sayYES; /* Success! */
5776 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5778 PerlIO_printf(Perl_debug_log,
5779 "%*s %ssubpattern success...%s\n",
5780 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5781 sayYES; /* Success! */
5784 #define ST st->u.ifmatch
5789 case SUSPEND: /* (?>A) */
5791 newstart = locinput;
5794 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5796 goto ifmatch_trivial_fail_test;
5798 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5800 ifmatch_trivial_fail_test:
5802 char * const s = HOPBACKc(locinput, scan->flags);
5807 sw = 1 - cBOOL(ST.wanted);
5811 next = scan + ARG(scan);
5819 newstart = locinput;
5823 ST.logical = logical;
5824 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5826 /* execute body of (?...A) */
5827 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
5828 assert(0); /* NOTREACHED */
5831 case IFMATCH_A_fail: /* body of (?...A) failed */
5832 ST.wanted = !ST.wanted;
5835 case IFMATCH_A: /* body of (?...A) succeeded */
5837 sw = cBOOL(ST.wanted);
5839 else if (!ST.wanted)
5842 if (OP(ST.me) != SUSPEND) {
5843 /* restore old position except for (?>...) */
5844 locinput = st->locinput;
5845 nextchr = UCHARAT(locinput);
5847 scan = ST.me + ARG(ST.me);
5850 continue; /* execute B */
5855 next = scan + ARG(scan);
5860 reginfo->cutpoint = PL_regeol;
5864 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5865 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
5866 assert(0); /* NOTREACHED */
5867 case COMMIT_next_fail:
5872 assert(0); /* NOTREACHED */
5874 #define ST st->u.mark
5876 ST.prev_mark = mark_state;
5877 ST.mark_name = sv_commit = sv_yes_mark
5878 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5880 ST.mark_loc = locinput;
5881 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
5882 assert(0); /* NOTREACHED */
5883 case MARKPOINT_next:
5884 mark_state = ST.prev_mark;
5886 assert(0); /* NOTREACHED */
5887 case MARKPOINT_next_fail:
5888 if (popmark && sv_eq(ST.mark_name,popmark))
5890 if (ST.mark_loc > startpoint)
5891 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5892 popmark = NULL; /* we found our mark */
5893 sv_commit = ST.mark_name;
5896 PerlIO_printf(Perl_debug_log,
5897 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5898 REPORT_CODE_OFF+depth*2, "",
5899 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5902 mark_state = ST.prev_mark;
5903 sv_yes_mark = mark_state ?
5904 mark_state->u.mark.mark_name : NULL;
5906 assert(0); /* NOTREACHED */
5909 /* (*SKIP) : if we fail we cut here*/
5910 ST.mark_name = NULL;
5911 ST.mark_loc = locinput;
5912 PUSH_STATE_GOTO(SKIP_next,next, locinput);
5914 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5915 otherwise do nothing. Meaning we need to scan
5917 regmatch_state *cur = mark_state;
5918 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5921 if ( sv_eq( cur->u.mark.mark_name,
5924 ST.mark_name = find;
5925 PUSH_STATE_GOTO( SKIP_next, next, locinput);
5927 cur = cur->u.mark.prev_mark;
5930 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5932 case SKIP_next_fail:
5934 /* (*CUT:NAME) - Set up to search for the name as we
5935 collapse the stack*/
5936 popmark = ST.mark_name;
5938 /* (*CUT) - No name, we cut here.*/
5939 if (ST.mark_loc > startpoint)
5940 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5941 /* but we set sv_commit to latest mark_name if there
5942 is one so they can test to see how things lead to this
5945 sv_commit=mark_state->u.mark.mark_name;
5949 assert(0); /* NOTREACHED */
5952 if ((n=is_LNBREAK(locinput,utf8_target))) {
5954 nextchr = UCHARAT(locinput);
5959 #define CASE_CLASS(nAmE) \
5961 if (locinput >= PL_regeol) \
5963 if ((n=is_##nAmE(locinput,utf8_target))) { \
5965 nextchr = UCHARAT(locinput); \
5970 if (locinput >= PL_regeol) \
5972 if ((n=is_##nAmE(locinput,utf8_target))) { \
5975 locinput += UTF8SKIP(locinput); \
5976 nextchr = UCHARAT(locinput); \
5981 CASE_CLASS(HORIZWS);
5985 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5986 PTR2UV(scan), OP(scan));
5987 Perl_croak(aTHX_ "regexp memory corruption");
5991 /* switch break jumps here */
5992 scan = next; /* prepare to execute the next op and ... */
5993 continue; /* ... jump back to the top, reusing st */
5994 assert(0); /* NOTREACHED */
5997 /* push a state that backtracks on success */
5998 st->u.yes.prev_yes_state = yes_state;
6002 /* push a new regex state, then continue at scan */
6004 regmatch_state *newst;
6007 regmatch_state *cur = st;
6008 regmatch_state *curyes = yes_state;
6010 regmatch_slab *slab = PL_regmatch_slab;
6011 for (;curd > -1;cur--,curd--) {
6012 if (cur < SLAB_FIRST(slab)) {
6014 cur = SLAB_LAST(slab);
6016 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6017 REPORT_CODE_OFF + 2 + depth * 2,"",
6018 curd, PL_reg_name[cur->resume_state],
6019 (curyes == cur) ? "yes" : ""
6022 curyes = cur->u.yes.prev_yes_state;
6025 DEBUG_STATE_pp("push")
6028 st->locinput = locinput;
6030 if (newst > SLAB_LAST(PL_regmatch_slab))
6031 newst = S_push_slab(aTHX);
6032 PL_regmatch_state = newst;
6034 locinput = pushinput;
6035 nextchr = UCHARAT(locinput);
6038 assert(0); /* NOTREACHED */
6043 * We get here only if there's trouble -- normally "case END" is
6044 * the terminating point.
6046 Perl_croak(aTHX_ "corrupted regexp pointers");
6052 /* we have successfully completed a subexpression, but we must now
6053 * pop to the state marked by yes_state and continue from there */
6054 assert(st != yes_state);
6056 while (st != yes_state) {
6058 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6059 PL_regmatch_slab = PL_regmatch_slab->prev;
6060 st = SLAB_LAST(PL_regmatch_slab);
6064 DEBUG_STATE_pp("pop (no final)");
6066 DEBUG_STATE_pp("pop (yes)");
6072 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6073 || yes_state > SLAB_LAST(PL_regmatch_slab))
6075 /* not in this slab, pop slab */
6076 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6077 PL_regmatch_slab = PL_regmatch_slab->prev;
6078 st = SLAB_LAST(PL_regmatch_slab);
6080 depth -= (st - yes_state);
6083 yes_state = st->u.yes.prev_yes_state;
6084 PL_regmatch_state = st;
6087 locinput= st->locinput;
6088 nextchr = UCHARAT(locinput);
6090 state_num = st->resume_state + no_final;
6091 goto reenter_switch;
6094 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6095 PL_colors[4], PL_colors[5]));
6097 if (PL_reg_state.re_state_eval_setup_done) {
6098 /* each successfully executed (?{...}) block does the equivalent of
6099 * local $^R = do {...}
6100 * When popping the save stack, all these locals would be undone;
6101 * bypass this by setting the outermost saved $^R to the latest
6103 if (oreplsv != GvSV(PL_replgv))
6104 sv_setsv(oreplsv, GvSV(PL_replgv));
6111 PerlIO_printf(Perl_debug_log,
6112 "%*s %sfailed...%s\n",
6113 REPORT_CODE_OFF+depth*2, "",
6114 PL_colors[4], PL_colors[5])
6126 /* there's a previous state to backtrack to */
6128 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6129 PL_regmatch_slab = PL_regmatch_slab->prev;
6130 st = SLAB_LAST(PL_regmatch_slab);
6132 PL_regmatch_state = st;
6133 locinput= st->locinput;
6134 nextchr = UCHARAT(locinput);
6136 DEBUG_STATE_pp("pop");
6138 if (yes_state == st)
6139 yes_state = st->u.yes.prev_yes_state;
6141 state_num = st->resume_state + 1; /* failure = success + 1 */
6142 goto reenter_switch;
6147 if (rex->intflags & PREGf_VERBARG_SEEN) {
6148 SV *sv_err = get_sv("REGERROR", 1);
6149 SV *sv_mrk = get_sv("REGMARK", 1);
6151 sv_commit = &PL_sv_no;
6153 sv_yes_mark = &PL_sv_yes;
6156 sv_commit = &PL_sv_yes;
6157 sv_yes_mark = &PL_sv_no;
6159 sv_setsv(sv_err, sv_commit);
6160 sv_setsv(sv_mrk, sv_yes_mark);
6164 if (last_pushed_cv) {
6167 PERL_UNUSED_VAR(SP);
6170 /* clean up; in particular, free all slabs above current one */
6171 LEAVE_SCOPE(oldsave);
6173 assert(!result || locinput - PL_bostr >= 0);
6174 return result ? locinput - PL_bostr : -1;
6178 - regrepeat - repeatedly match something simple, report how many
6180 * startposp - pointer a pointer to the start position. This is updated
6181 * to point to the byte following the highest successful
6183 * p - the regnode to be repeatedly matched against.
6184 * max - maximum number of characters to match.
6185 * depth - (for debugging) backtracking depth.
6188 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth)
6193 char *loceol = PL_regeol;
6195 bool utf8_target = PL_reg_match_utf8;
6198 PERL_UNUSED_ARG(depth);
6201 PERL_ARGS_ASSERT_REGREPEAT;
6204 if (max == REG_INFTY)
6206 else if (max < loceol - scan)
6207 loceol = scan + max;
6212 while (scan < loceol && hardcount < max && *scan != '\n') {
6213 scan += UTF8SKIP(scan);
6217 while (scan < loceol && *scan != '\n')
6224 while (scan < loceol && hardcount < max) {
6225 scan += UTF8SKIP(scan);
6236 /* To get here, EXACTish nodes must have *byte* length == 1. That
6237 * means they match only characters in the string that can be expressed
6238 * as a single byte. For non-utf8 strings, that means a simple match.
6239 * For utf8 strings, the character matched must be an invariant, or
6240 * downgradable to a single byte. The pattern's utf8ness is
6241 * irrelevant, as since it's a single byte, it either isn't utf8, or if
6242 * it is, it's an invariant */
6245 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6247 if (! utf8_target || UNI_IS_INVARIANT(c)) {
6248 while (scan < loceol && UCHARAT(scan) == c) {
6254 /* Here, the string is utf8, and the pattern char is different
6255 * in utf8 than not, so can't compare them directly. Outside the
6256 * loop, find the two utf8 bytes that represent c, and then
6257 * look for those in sequence in the utf8 string */
6258 U8 high = UTF8_TWO_BYTE_HI(c);
6259 U8 low = UTF8_TWO_BYTE_LO(c);
6262 while (hardcount < max
6263 && scan + 1 < loceol
6264 && UCHARAT(scan) == high
6265 && UCHARAT(scan + 1) == low)
6273 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6277 PL_reg_flags |= RF_tainted;
6278 utf8_flags = FOLDEQ_UTF8_LOCALE;
6286 case EXACTFU_TRICKYFOLD:
6288 utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
6290 /* The comments for the EXACT case above apply as well to these fold
6295 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
6297 if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */
6298 char *tmpeol = loceol;
6299 while (hardcount < max
6300 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
6301 STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags))
6308 /* XXX Note that the above handles properly the German sharp s in
6309 * the pattern matching ss in the string. But it doesn't handle
6310 * properly cases where the string contains say 'LIGATURE ff' and
6311 * the pattern is 'f+'. This would require, say, a new function or
6312 * revised interface to foldEQ_utf8(), in which the maximum number
6313 * of characters to match could be passed and it would return how
6314 * many actually did. This is just one of many cases where
6315 * multi-char folds don't work properly, and so the fix is being
6321 /* Here, the string isn't utf8 and c is a single byte; and either
6322 * the pattern isn't utf8 or c is an invariant, so its utf8ness
6323 * doesn't affect c. Can just do simple comparisons for exact or
6326 case EXACTF: folded = PL_fold[c]; break;
6328 case EXACTFU_TRICKYFOLD:
6329 case EXACTFU: folded = PL_fold_latin1[c]; break;
6330 case EXACTFL: folded = PL_fold_locale[c]; break;
6331 default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
6333 while (scan < loceol &&
6334 (UCHARAT(scan) == c || UCHARAT(scan) == folded))
6342 if (utf8_target || OP(p) == ANYOFV) {
6345 inclasslen = loceol - scan;
6346 while (hardcount < max
6347 && ((inclasslen = loceol - scan) > 0)
6348 && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target))
6354 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
6362 LOAD_UTF8_CHARCLASS_ALNUM();
6363 while (hardcount < max && scan < loceol &&
6364 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6366 scan += UTF8SKIP(scan);
6370 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
6378 while (scan < loceol && isALNUM((U8) *scan)) {
6383 while (scan < loceol && isWORDCHAR_A((U8) *scan)) {
6388 PL_reg_flags |= RF_tainted;
6391 while (hardcount < max && scan < loceol &&
6392 isALNUM_LC_utf8((U8*)scan)) {
6393 scan += UTF8SKIP(scan);
6397 while (scan < loceol && isALNUM_LC(*scan))
6407 LOAD_UTF8_CHARCLASS_ALNUM();
6408 while (hardcount < max && scan < loceol &&
6409 ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
6411 scan += UTF8SKIP(scan);
6415 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
6422 goto utf8_Nwordchar;
6423 while (scan < loceol && ! isALNUM((U8) *scan)) {
6429 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
6435 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6436 scan += UTF8SKIP(scan);
6440 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
6447 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6448 scan += UTF8SKIP(scan);
6452 while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) {
6458 PL_reg_flags |= RF_tainted;
6461 while (hardcount < max && scan < loceol &&
6462 !isALNUM_LC_utf8((U8*)scan)) {
6463 scan += UTF8SKIP(scan);
6467 while (scan < loceol && !isALNUM_LC(*scan))
6477 LOAD_UTF8_CHARCLASS_SPACE();
6478 while (hardcount < max && scan < loceol &&
6480 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6482 scan += UTF8SKIP(scan);
6488 while (scan < loceol && isSPACE_L1((U8) *scan)) {
6497 while (scan < loceol && isSPACE((U8) *scan)) {
6502 while (scan < loceol && isSPACE_A((U8) *scan)) {
6507 PL_reg_flags |= RF_tainted;
6510 while (hardcount < max && scan < loceol &&
6511 isSPACE_LC_utf8((U8*)scan)) {
6512 scan += UTF8SKIP(scan);
6516 while (scan < loceol && isSPACE_LC(*scan))
6526 LOAD_UTF8_CHARCLASS_SPACE();
6527 while (hardcount < max && scan < loceol &&
6529 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
6531 scan += UTF8SKIP(scan);
6537 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
6546 while (scan < loceol && ! isSPACE((U8) *scan)) {
6552 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6553 scan += UTF8SKIP(scan);
6557 while (scan < loceol && ! isSPACE_A((U8) *scan)) {
6563 PL_reg_flags |= RF_tainted;
6566 while (hardcount < max && scan < loceol &&
6567 !isSPACE_LC_utf8((U8*)scan)) {
6568 scan += UTF8SKIP(scan);
6572 while (scan < loceol && !isSPACE_LC(*scan))
6579 LOAD_UTF8_CHARCLASS_DIGIT();
6580 while (hardcount < max && scan < loceol &&
6581 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6582 scan += UTF8SKIP(scan);
6586 while (scan < loceol && isDIGIT(*scan))
6591 while (scan < loceol && isDIGIT_A((U8) *scan)) {
6596 PL_reg_flags |= RF_tainted;
6599 while (hardcount < max && scan < loceol &&
6600 isDIGIT_LC_utf8((U8*)scan)) {
6601 scan += UTF8SKIP(scan);
6605 while (scan < loceol && isDIGIT_LC(*scan))
6612 LOAD_UTF8_CHARCLASS_DIGIT();
6613 while (hardcount < max && scan < loceol &&
6614 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6615 scan += UTF8SKIP(scan);
6619 while (scan < loceol && !isDIGIT(*scan))
6625 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6626 scan += UTF8SKIP(scan);
6630 while (scan < loceol && ! isDIGIT_A((U8) *scan)) {
6636 PL_reg_flags |= RF_tainted;
6639 while (hardcount < max && scan < loceol &&
6640 !isDIGIT_LC_utf8((U8*)scan)) {
6641 scan += UTF8SKIP(scan);
6645 while (scan < loceol && !isDIGIT_LC(*scan))
6652 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6658 LNBREAK can match two latin chars, which is ok,
6659 because we have a null terminated string, but we
6660 have to use hardcount in this situation
6662 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
6671 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6676 while (scan < loceol && is_HORIZWS_latin1(scan))
6683 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6684 scan += UTF8SKIP(scan);
6688 while (scan < loceol && !is_HORIZWS_latin1(scan))
6696 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6701 while (scan < loceol && is_VERTWS_latin1(scan))
6709 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6710 scan += UTF8SKIP(scan);
6714 while (scan < loceol && !is_VERTWS_latin1(scan))
6720 default: /* Called on something of 0 width. */
6721 break; /* So match right here or not at all. */
6727 c = scan - *startposp;
6731 GET_RE_DEBUG_FLAGS_DECL;
6733 SV * const prop = sv_newmortal();
6734 regprop(prog, prop, p);
6735 PerlIO_printf(Perl_debug_log,
6736 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
6737 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6745 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6747 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
6748 create a copy so that changes the caller makes won't change the shared one
6751 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6753 PERL_ARGS_ASSERT_REGCLASS_SWASH;
6754 return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp));
6759 S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6761 /* Returns the swash for the input 'node' in the regex 'prog'.
6762 * If <doinit> is true, will attempt to create the swash if not already
6764 * If <listsvp> is non-null, will return the swash initialization string in
6766 * If <altsvp> is non-null, will return the alternates to the regular swash
6768 * Tied intimately to how regcomp.c sets up the data structure */
6776 RXi_GET_DECL(prog,progi);
6777 const struct reg_data * const data = prog ? progi->data : NULL;
6779 PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH;
6781 assert(ANYOF_NONBITMAP(node));
6783 if (data && data->count) {
6784 const U32 n = ARG(node);
6786 if (data->what[n] == 's') {
6787 SV * const rv = MUTABLE_SV(data->data[n]);
6788 AV * const av = MUTABLE_AV(SvRV(rv));
6789 SV **const ary = AvARRAY(av);
6790 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
6792 si = *ary; /* ary[0] = the string to initialize the swash with */
6794 /* Elements 3 and 4 are either both present or both absent. [3] is
6795 * any inversion list generated at compile time; [4] indicates if
6796 * that inversion list has any user-defined properties in it. */
6797 if (av_len(av) >= 3) {
6800 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
6807 /* Element [1] is reserved for the set-up swash. If already there,
6808 * return it; if not, create it and store it there */
6809 if (SvROK(ary[1])) {
6812 else if (si && doinit) {
6814 sw = _core_swash_init("utf8", /* the utf8 package */
6818 0, /* not from tr/// */
6821 (void)av_store(av, 1, sw);
6824 /* Element [2] is for any multi-char folds. Note that is a
6825 * fundamentally flawed design, because can't backtrack and try
6826 * again. See [perl #89774] */
6827 if (SvTYPE(ary[2]) == SVt_PVAV) {
6834 SV* matches_string = newSVpvn("", 0);
6836 /* Use the swash, if any, which has to have incorporated into it all
6838 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
6839 && (si && si != &PL_sv_undef))
6842 /* If no swash, use the input initialization string, if available */
6843 sv_catsv(matches_string, si);
6846 /* Add the inversion list to whatever we have. This may have come from
6847 * the swash, or from an input parameter */
6849 sv_catsv(matches_string, _invlist_contents(invlist));
6851 *listsvp = matches_string;
6861 - reginclass - determine if a character falls into a character class
6863 n is the ANYOF regnode
6864 p is the target string
6865 lenp is pointer to the maximum number of bytes of how far to go in p
6866 (This is assumed wthout checking to always be at least the current
6868 utf8_target tells whether p is in UTF-8.
6870 Returns true if matched; false otherwise. If lenp is not NULL, on return
6871 from a successful match, the value it points to will be updated to how many
6872 bytes in p were matched. If there was no match, the value is undefined,
6873 possibly changed from the input.
6875 Note that this can be a synthetic start class, a combination of various
6876 nodes, so things you think might be mutually exclusive, such as locale,
6877 aren't. It can match both locale and non-locale
6882 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6885 const char flags = ANYOF_FLAGS(n);
6891 PERL_ARGS_ASSERT_REGINCLASS;
6893 /* If c is not already the code point, get it */
6894 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6895 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6896 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6897 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6898 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6899 * UTF8_ALLOW_FFFF */
6900 if (c_len == (STRLEN)-1)
6901 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6907 /* Use passed in max length, or one character if none passed in or less
6908 * than one character. And assume will match just one character. This is
6909 * overwritten later if matched more. */
6911 maxlen = (*lenp > c_len) ? *lenp : c_len;
6919 /* If this character is potentially in the bitmap, check it */
6921 if (ANYOF_BITMAP_TEST(n, c))
6923 else if (flags & ANYOF_NON_UTF8_LATIN1_ALL
6930 else if (flags & ANYOF_LOCALE) {
6931 PL_reg_flags |= RF_tainted;
6933 if ((flags & ANYOF_LOC_NONBITMAP_FOLD)
6934 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
6938 else if (ANYOF_CLASS_TEST_ANY_SET(n) &&
6939 ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
6940 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
6941 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
6942 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
6943 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
6944 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
6945 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
6946 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6947 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
6948 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
6949 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) ||
6950 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) ||
6951 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
6952 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
6953 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
6954 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
6955 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
6956 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
6957 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
6958 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
6959 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
6960 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
6961 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
6962 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
6963 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
6964 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
6965 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
6966 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
6967 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) ||
6968 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c))
6969 ) /* How's that for a conditional? */
6976 /* If the bitmap didn't (or couldn't) match, and something outside the
6977 * bitmap could match, try that. Locale nodes specifiy completely the
6978 * behavior of code points in the bit map (otherwise, a utf8 target would
6979 * cause them to be treated as Unicode and not locale), except in
6980 * the very unlikely event when this node is a synthetic start class, which
6981 * could be a combination of locale and non-locale nodes. So allow locale
6982 * to match for the synthetic start class, which will give a false
6983 * positive that will be resolved when the match is done again as not part
6984 * of the synthetic start class */
6986 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6987 match = TRUE; /* Everything above 255 matches */
6989 else if (ANYOF_NONBITMAP(n)
6990 && ((flags & ANYOF_NONBITMAP_NON_UTF8)
6993 || (! (flags & ANYOF_LOCALE))
6994 || (flags & ANYOF_IS_SYNTHETIC)))))
6997 SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av);
7005 /* Not utf8. Convert as much of the string as available up
7006 * to the limit of how far the (single) character in the
7007 * pattern can possibly match (no need to go further). If
7008 * the node is a straight ANYOF or not folding, it can't
7009 * match more than one. Otherwise, It can match up to how
7010 * far a single char can fold to. Since not utf8, each
7011 * character is a single byte, so the max it can be in
7012 * bytes is the same as the max it can be in characters */
7013 STRLEN len = (OP(n) == ANYOF
7014 || ! (flags & ANYOF_LOC_NONBITMAP_FOLD))
7016 : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND)
7018 : UTF8_MAX_FOLD_CHAR_EXPAND;
7019 utf8_p = bytes_to_utf8(p, &len);
7022 if (swash_fetch(sw, utf8_p, TRUE))
7024 else if (flags & ANYOF_LOC_NONBITMAP_FOLD) {
7026 /* Here, we need to test if the fold of the target string
7027 * matches. The non-multi char folds have all been moved to
7028 * the compilation phase, and the multi-char folds have
7029 * been stored by regcomp into 'av'; we linearly check to
7030 * see if any match the target string (folded). We know
7031 * that the originals were each one character, but we don't
7032 * currently know how many characters/bytes each folded to,
7033 * except we do know that there are small limits imposed by
7034 * Unicode. XXX A performance enhancement would be to have
7035 * regcomp.c store the max number of chars/bytes that are
7036 * in an av entry, as, say the 0th element. Even better
7037 * would be to have a hash of the few characters that can
7038 * start a multi-char fold to the max number of chars of
7041 * If there is a match, we will need to advance (if lenp is
7042 * specified) the match pointer in the target string. But
7043 * what we are comparing here isn't that string directly,
7044 * but its fold, whose length may differ from the original.
7045 * As we go along in constructing the fold, therefore, we
7046 * create a map so that we know how many bytes in the
7047 * source to advance given that we have matched a certain
7048 * number of bytes in the fold. This map is stored in
7049 * 'map_fold_len_back'. Let n mean the number of bytes in
7050 * the fold of the first character that we are folding.
7051 * Then map_fold_len_back[n] is set to the number of bytes
7052 * in that first character. Similarly let m be the
7053 * corresponding number for the second character to be
7054 * folded. Then map_fold_len_back[n+m] is set to the
7055 * number of bytes occupied by the first two source
7056 * characters. ... */
7057 U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 };
7058 U8 folded[UTF8_MAXBYTES_CASE+1];
7059 STRLEN foldlen = 0; /* num bytes in fold of 1st char */
7060 STRLEN total_foldlen = 0; /* num bytes in fold of all
7063 if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) {
7065 /* Here, only need to fold the first char of the target
7066 * string. It the source wasn't utf8, is 1 byte long */
7067 to_utf8_fold(utf8_p, folded, &foldlen);
7068 total_foldlen = foldlen;
7069 map_fold_len_back[foldlen] = (utf8_target)
7075 /* Here, need to fold more than the first char. Do so
7076 * up to the limits */
7077 U8* source_ptr = utf8_p; /* The source for the fold
7080 U8* folded_ptr = folded;
7081 U8* e = utf8_p + maxlen; /* Can't go beyond last
7082 available byte in the
7086 i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e;
7090 /* Fold the next character */
7091 U8 this_char_folded[UTF8_MAXBYTES_CASE+1];
7092 STRLEN this_char_foldlen;
7093 to_utf8_fold(source_ptr,
7095 &this_char_foldlen);
7097 /* Bail if it would exceed the byte limit for
7098 * folding a single char. */
7099 if (this_char_foldlen + folded_ptr - folded >
7105 /* Add the fold of this character */
7106 Copy(this_char_folded,
7110 source_ptr += UTF8SKIP(source_ptr);
7111 folded_ptr += this_char_foldlen;
7112 total_foldlen = folded_ptr - folded;
7114 /* Create map from the number of bytes in the fold
7115 * back to the number of bytes in the source. If
7116 * the source isn't utf8, the byte count is just
7117 * the number of characters so far */
7118 map_fold_len_back[total_foldlen]
7120 ? source_ptr - utf8_p
7127 /* Do the linear search to see if the fold is in the list
7128 * of multi-char folds. */
7131 for (i = 0; i <= av_len(av); i++) {
7132 SV* const sv = *av_fetch(av, i, FALSE);
7134 const char * const s = SvPV_const(sv, len);
7136 if (len <= total_foldlen
7137 && memEQ(s, (char*)folded, len)
7139 /* If 0, means matched a partial char. See
7141 && map_fold_len_back[len])
7144 /* Advance the target string ptr to account for
7145 * this fold, but have to translate from the
7146 * folded length to the corresponding source
7149 *lenp = map_fold_len_back[len];
7158 /* If we allocated a string above, free it */
7159 if (! utf8_target) Safefree(utf8_p);
7163 if (UNICODE_IS_SUPER(c)
7164 && (flags & ANYOF_WARN_SUPER)
7165 && ckWARN_d(WARN_NON_UNICODE))
7167 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7168 "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c);
7172 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7173 return cBOOL(flags & ANYOF_INVERT) ^ match;
7177 S_reghop3(U8 *s, I32 off, const U8* lim)
7179 /* return the position 'off' UTF-8 characters away from 's', forward if
7180 * 'off' >= 0, backwards if negative. But don't go outside of position
7181 * 'lim', which better be < s if off < 0 */
7185 PERL_ARGS_ASSERT_REGHOP3;
7188 while (off-- && s < lim) {
7189 /* XXX could check well-formedness here */
7194 while (off++ && s > lim) {
7196 if (UTF8_IS_CONTINUED(*s)) {
7197 while (s > lim && UTF8_IS_CONTINUATION(*s))
7200 /* XXX could check well-formedness here */
7207 /* there are a bunch of places where we use two reghop3's that should
7208 be replaced with this routine. but since thats not done yet
7209 we ifdef it out - dmq
7212 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
7216 PERL_ARGS_ASSERT_REGHOP4;
7219 while (off-- && s < rlim) {
7220 /* XXX could check well-formedness here */
7225 while (off++ && s > llim) {
7227 if (UTF8_IS_CONTINUED(*s)) {
7228 while (s > llim && UTF8_IS_CONTINUATION(*s))
7231 /* XXX could check well-formedness here */
7239 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
7243 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7246 while (off-- && s < lim) {
7247 /* XXX could check well-formedness here */
7254 while (off++ && s > lim) {
7256 if (UTF8_IS_CONTINUED(*s)) {
7257 while (s > lim && UTF8_IS_CONTINUATION(*s))
7260 /* XXX could check well-formedness here */
7269 restore_pos(pTHX_ void *arg)
7272 regexp * const rex = (regexp *)arg;
7273 if (PL_reg_state.re_state_eval_setup_done) {
7274 if (PL_reg_oldsaved) {
7275 rex->subbeg = PL_reg_oldsaved;
7276 rex->sublen = PL_reg_oldsavedlen;
7277 rex->suboffset = PL_reg_oldsavedoffset;
7278 rex->subcoffset = PL_reg_oldsavedcoffset;
7279 #ifdef PERL_OLD_COPY_ON_WRITE
7280 rex->saved_copy = PL_nrs;
7282 RXp_MATCH_COPIED_on(rex);
7284 PL_reg_magic->mg_len = PL_reg_oldpos;
7285 PL_reg_state.re_state_eval_setup_done = FALSE;
7286 PL_curpm = PL_reg_oldcurpm;
7291 S_to_utf8_substr(pTHX_ register regexp *prog)
7295 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
7298 if (prog->substrs->data[i].substr
7299 && !prog->substrs->data[i].utf8_substr) {
7300 SV* const sv = newSVsv(prog->substrs->data[i].substr);
7301 prog->substrs->data[i].utf8_substr = sv;
7302 sv_utf8_upgrade(sv);
7303 if (SvVALID(prog->substrs->data[i].substr)) {
7304 if (SvTAIL(prog->substrs->data[i].substr)) {
7305 /* Trim the trailing \n that fbm_compile added last
7307 SvCUR_set(sv, SvCUR(sv) - 1);
7308 /* Whilst this makes the SV technically "invalid" (as its
7309 buffer is no longer followed by "\0") when fbm_compile()
7310 adds the "\n" back, a "\0" is restored. */
7311 fbm_compile(sv, FBMcf_TAIL);
7315 if (prog->substrs->data[i].substr == prog->check_substr)
7316 prog->check_utf8 = sv;
7322 S_to_byte_substr(pTHX_ register regexp *prog)
7327 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
7330 if (prog->substrs->data[i].utf8_substr
7331 && !prog->substrs->data[i].substr) {
7332 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
7333 if (sv_utf8_downgrade(sv, TRUE)) {
7334 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
7335 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
7336 /* Trim the trailing \n that fbm_compile added last
7338 SvCUR_set(sv, SvCUR(sv) - 1);
7339 fbm_compile(sv, FBMcf_TAIL);
7347 prog->substrs->data[i].substr = sv;
7348 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
7349 prog->check_substr = sv;
7354 /* These constants are for finding GCB=LV and GCB=LVT. These are for the
7355 * pre-composed Hangul syllables, which are all in a contiguous block and
7356 * arranged there in such a way so as to facilitate alorithmic determination of
7357 * their characteristics. As such, they don't need a swash, but can be
7358 * determined by simple arithmetic. Almost all are GCB=LVT, but every 28th one
7360 #define SBASE 0xAC00 /* Start of block */
7361 #define SCount 11172 /* Length of block */
7364 #if 0 /* This routine is not currently used */
7365 PERL_STATIC_INLINE bool
7366 S_is_utf8_X_LV(pTHX_ const U8 *p)
7368 /* Unlike most other similarly named routines here, this does not create a
7369 * swash, so swash_fetch() cannot be used on PL_utf8_X_LV. */
7373 UV cp = valid_utf8_to_uvchr(p, NULL);
7375 PERL_ARGS_ASSERT_IS_UTF8_X_LV;
7377 /* The earliest Unicode releases did not have these precomposed Hangul
7378 * syllables. Set to point to undef in that case, so will return false on
7380 if (! PL_utf8_X_LV) { /* Set up if this is the first time called */
7381 PL_utf8_X_LV = swash_init("utf8", "_X_GCB_LV", &PL_sv_undef, 1, 0);
7382 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LV)) == 0) {
7383 SvREFCNT_dec(PL_utf8_X_LV);
7384 PL_utf8_X_LV = &PL_sv_undef;
7388 return (PL_utf8_X_LV != &PL_sv_undef
7389 && cp >= SBASE && cp < SBASE + SCount
7390 && (cp - SBASE) % TCount == 0); /* Only every TCount one is LV */
7394 PERL_STATIC_INLINE bool
7395 S_is_utf8_X_LVT(pTHX_ const U8 *p)
7397 /* Unlike most other similarly named routines here, this does not create a
7398 * swash, so swash_fetch() cannot be used on PL_utf8_X_LVT. */
7402 UV cp = valid_utf8_to_uvchr(p, NULL);
7404 PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
7406 /* The earliest Unicode releases did not have these precomposed Hangul
7407 * syllables. Set to point to undef in that case, so will return false on
7409 if (! PL_utf8_X_LVT) { /* Set up if this is the first time called */
7410 PL_utf8_X_LVT = swash_init("utf8", "_X_GCB_LVT", &PL_sv_undef, 1, 0);
7411 if (_invlist_len(_get_swash_invlist(PL_utf8_X_LVT)) == 0) {
7412 SvREFCNT_dec(PL_utf8_X_LVT);
7413 PL_utf8_X_LVT = &PL_sv_undef;
7417 return (PL_utf8_X_LVT != &PL_sv_undef
7418 && cp >= SBASE && cp < SBASE + SCount
7419 && (cp - SBASE) % TCount != 0); /* All but every TCount one is LV */
7424 * c-indentation-style: bsd
7426 * indent-tabs-mode: nil
7429 * ex: set ts=8 sts=4 sw=4 et: