5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
78 #ifdef PERL_IN_XSUB_RE
84 #define RF_tainted 1 /* tainted information used? */
85 #define RF_warned 2 /* warned about big count? */
87 #define RF_utf8 8 /* Pattern contains multibyte chars? */
89 #define UTF ((PL_reg_flags & RF_utf8) != 0)
91 #define RS_init 1 /* eval environment created */
92 #define RS_set 2 /* replsv value is set */
98 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
104 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
105 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
107 #define HOPc(pos,off) \
108 (char *)(PL_reg_match_utf8 \
109 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
111 #define HOPBACKc(pos, off) \
112 (char*)(PL_reg_match_utf8\
113 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
114 : (pos - off >= PL_bostr) \
118 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
119 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
121 /* these are unrolled below in the CCC_TRY_XXX defined */
122 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
123 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
124 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
125 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
126 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
127 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
131 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
132 so that it is possible to override the option here without having to
133 rebuild the entire core. as we are required to do if we change regcomp.h
134 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
136 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
137 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
140 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
141 #define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM()
142 #define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE()
143 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
144 #define RE_utf8_perl_word PL_utf8_alnum
145 #define RE_utf8_perl_space PL_utf8_space
146 #define RE_utf8_posix_digit PL_utf8_digit
147 #define perl_word alnum
148 #define perl_space space
149 #define posix_digit digit
151 #define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a")
152 #define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ")
153 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
154 #define RE_utf8_perl_word PL_utf8_perl_word
155 #define RE_utf8_perl_space PL_utf8_perl_space
156 #define RE_utf8_posix_digit PL_utf8_posix_digit
160 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
162 PL_reg_flags |= RF_tainted; \
167 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \
168 if (!CAT2(PL_utf8_,CLASS)) { \
172 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
176 if (!(OP(scan) == NAME \
177 ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8) \
178 : LCFUNC_utf8((U8*)locinput))) \
182 locinput += PL_utf8skip[nextchr]; \
183 nextchr = UCHARAT(locinput); \
186 if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
188 nextchr = UCHARAT(++locinput); \
191 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
193 PL_reg_flags |= RF_tainted; \
196 if (!nextchr && locinput >= PL_regeol) \
198 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \
199 if (!CAT2(PL_utf8_,CLASS)) { \
203 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
207 if ((OP(scan) == NAME \
208 ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8) \
209 : LCFUNC_utf8((U8*)locinput))) \
213 locinput += PL_utf8skip[nextchr]; \
214 nextchr = UCHARAT(locinput); \
217 if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
219 nextchr = UCHARAT(++locinput); \
226 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
228 /* for use after a quantifier and before an EXACT-like node -- japhy */
229 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
230 #define JUMPABLE(rn) ( \
232 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
234 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
235 OP(rn) == PLUS || OP(rn) == MINMOD || \
236 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
237 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
239 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
241 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
244 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
245 we don't need this definition. */
246 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
247 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
248 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
251 /* ... so we use this as its faster. */
252 #define IS_TEXT(rn) ( OP(rn)==EXACT )
253 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
254 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
259 Search for mandatory following text node; for lookahead, the text must
260 follow but for lookbehind (rn->flags != 0) we skip to the next step.
262 #define FIND_NEXT_IMPT(rn) STMT_START { \
263 while (JUMPABLE(rn)) { \
264 const OPCODE type = OP(rn); \
265 if (type == SUSPEND || PL_regkind[type] == CURLY) \
266 rn = NEXTOPER(NEXTOPER(rn)); \
267 else if (type == PLUS) \
269 else if (type == IFMATCH) \
270 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
271 else rn += NEXT_OFF(rn); \
276 static void restore_pos(pTHX_ void *arg);
279 S_regcppush(pTHX_ I32 parenfloor)
282 const int retval = PL_savestack_ix;
283 #define REGCP_PAREN_ELEMS 4
284 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
286 GET_RE_DEBUG_FLAGS_DECL;
288 if (paren_elems_to_push < 0)
289 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
291 #define REGCP_OTHER_ELEMS 7
292 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
294 for (p = PL_regsize; p > parenfloor; p--) {
295 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
296 SSPUSHINT(PL_regoffs[p].end);
297 SSPUSHINT(PL_regoffs[p].start);
298 SSPUSHPTR(PL_reg_start_tmp[p]);
300 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
301 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
302 (UV)p, (IV)PL_regoffs[p].start,
303 (IV)(PL_reg_start_tmp[p] - PL_bostr),
304 (IV)PL_regoffs[p].end
307 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
308 SSPUSHPTR(PL_regoffs);
309 SSPUSHINT(PL_regsize);
310 SSPUSHINT(*PL_reglastparen);
311 SSPUSHINT(*PL_reglastcloseparen);
312 SSPUSHPTR(PL_reginput);
313 #define REGCP_FRAME_ELEMS 2
314 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
315 * are needed for the regexp context stack bookkeeping. */
316 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
317 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
322 /* These are needed since we do not localize EVAL nodes: */
323 #define REGCP_SET(cp) \
325 PerlIO_printf(Perl_debug_log, \
326 " Setting an EVAL scope, savestack=%"IVdf"\n", \
327 (IV)PL_savestack_ix)); \
330 #define REGCP_UNWIND(cp) \
332 if (cp != PL_savestack_ix) \
333 PerlIO_printf(Perl_debug_log, \
334 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
335 (IV)(cp), (IV)PL_savestack_ix)); \
339 S_regcppop(pTHX_ const regexp *rex)
344 GET_RE_DEBUG_FLAGS_DECL;
346 PERL_ARGS_ASSERT_REGCPPOP;
348 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
350 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
351 i = SSPOPINT; /* Parentheses elements to pop. */
352 input = (char *) SSPOPPTR;
353 *PL_reglastcloseparen = SSPOPINT;
354 *PL_reglastparen = SSPOPINT;
355 PL_regsize = SSPOPINT;
356 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
359 /* Now restore the parentheses context. */
360 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
361 i > 0; i -= REGCP_PAREN_ELEMS) {
363 U32 paren = (U32)SSPOPINT;
364 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
365 PL_regoffs[paren].start = SSPOPINT;
367 if (paren <= *PL_reglastparen)
368 PL_regoffs[paren].end = tmps;
370 PerlIO_printf(Perl_debug_log,
371 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
372 (UV)paren, (IV)PL_regoffs[paren].start,
373 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
374 (IV)PL_regoffs[paren].end,
375 (paren > *PL_reglastparen ? "(no)" : ""));
379 if (*PL_reglastparen + 1 <= rex->nparens) {
380 PerlIO_printf(Perl_debug_log,
381 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
382 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
386 /* It would seem that the similar code in regtry()
387 * already takes care of this, and in fact it is in
388 * a better location to since this code can #if 0-ed out
389 * but the code in regtry() is needed or otherwise tests
390 * requiring null fields (pat.t#187 and split.t#{13,14}
391 * (as of patchlevel 7877) will fail. Then again,
392 * this code seems to be necessary or otherwise
393 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
394 * --jhi updated by dapm */
395 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
397 PL_regoffs[i].start = -1;
398 PL_regoffs[i].end = -1;
404 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
407 * pregexec and friends
410 #ifndef PERL_IN_XSUB_RE
412 - pregexec - match a regexp against a string
415 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
416 char *strbeg, I32 minend, SV *screamer, U32 nosave)
417 /* strend: pointer to null at end of string */
418 /* strbeg: real beginning of string */
419 /* minend: end of match must be >=minend after stringarg. */
420 /* nosave: For optimizations. */
422 PERL_ARGS_ASSERT_PREGEXEC;
425 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
426 nosave ? 0 : REXEC_COPY_STR);
431 * Need to implement the following flags for reg_anch:
433 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
435 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
436 * INTUIT_AUTORITATIVE_ML
437 * INTUIT_ONCE_NOML - Intuit can match in one location only.
440 * Another flag for this function: SECOND_TIME (so that float substrs
441 * with giant delta may be not rechecked).
444 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
446 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
447 Otherwise, only SvCUR(sv) is used to get strbeg. */
449 /* XXXX We assume that strpos is strbeg unless sv. */
451 /* XXXX Some places assume that there is a fixed substring.
452 An update may be needed if optimizer marks as "INTUITable"
453 RExen without fixed substrings. Similarly, it is assumed that
454 lengths of all the strings are no more than minlen, thus they
455 cannot come from lookahead.
456 (Or minlen should take into account lookahead.)
457 NOTE: Some of this comment is not correct. minlen does now take account
458 of lookahead/behind. Further research is required. -- demerphq
462 /* A failure to find a constant substring means that there is no need to make
463 an expensive call to REx engine, thus we celebrate a failure. Similarly,
464 finding a substring too deep into the string means that less calls to
465 regtry() should be needed.
467 REx compiler's optimizer found 4 possible hints:
468 a) Anchored substring;
470 c) Whether we are anchored (beginning-of-line or \G);
471 d) First node (of those at offset 0) which may distingush positions;
472 We use a)b)d) and multiline-part of c), and try to find a position in the
473 string which does not contradict any of them.
476 /* Most of decisions we do here should have been done at compile time.
477 The nodes of the REx which we used for the search should have been
478 deleted from the finite automaton. */
481 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
482 char *strend, const U32 flags, re_scream_pos_data *data)
485 struct regexp *const prog = (struct regexp *)SvANY(rx);
486 register I32 start_shift = 0;
487 /* Should be nonnegative! */
488 register I32 end_shift = 0;
493 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
495 register char *other_last = NULL; /* other substr checked before this */
496 char *check_at = NULL; /* check substr found at this pos */
497 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
498 RXi_GET_DECL(prog,progi);
500 const char * const i_strpos = strpos;
502 GET_RE_DEBUG_FLAGS_DECL;
504 PERL_ARGS_ASSERT_RE_INTUIT_START;
506 RX_MATCH_UTF8_set(rx,do_utf8);
509 PL_reg_flags |= RF_utf8;
512 debug_start_match(rx, do_utf8, strpos, strend,
513 sv ? "Guessing start of match in sv for"
514 : "Guessing start of match in string for");
517 /* CHR_DIST() would be more correct here but it makes things slow. */
518 if (prog->minlen > strend - strpos) {
519 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
520 "String too short... [re_intuit_start]\n"));
524 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
527 if (!prog->check_utf8 && prog->check_substr)
528 to_utf8_substr(prog);
529 check = prog->check_utf8;
531 if (!prog->check_substr && prog->check_utf8)
532 to_byte_substr(prog);
533 check = prog->check_substr;
535 if (check == &PL_sv_undef) {
536 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
537 "Non-utf8 string cannot match utf8 check string\n"));
540 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
541 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
542 || ( (prog->extflags & RXf_ANCH_BOL)
543 && !multiline ) ); /* Check after \n? */
546 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
547 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
548 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
550 && (strpos != strbeg)) {
551 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
554 if (prog->check_offset_min == prog->check_offset_max &&
555 !(prog->extflags & RXf_CANY_SEEN)) {
556 /* Substring at constant offset from beg-of-str... */
559 s = HOP3c(strpos, prog->check_offset_min, strend);
562 slen = SvCUR(check); /* >= 1 */
564 if ( strend - s > slen || strend - s < slen - 1
565 || (strend - s == slen && strend[-1] != '\n')) {
566 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
569 /* Now should match s[0..slen-2] */
571 if (slen && (*SvPVX_const(check) != *s
573 && memNE(SvPVX_const(check), s, slen)))) {
575 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
579 else if (*SvPVX_const(check) != *s
580 || ((slen = SvCUR(check)) > 1
581 && memNE(SvPVX_const(check), s, slen)))
584 goto success_at_start;
587 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
589 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
590 end_shift = prog->check_end_shift;
593 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
594 - (SvTAIL(check) != 0);
595 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
597 if (end_shift < eshift)
601 else { /* Can match at random position */
604 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
605 end_shift = prog->check_end_shift;
607 /* end shift should be non negative here */
610 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
612 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
613 (IV)end_shift, RX_PRECOMP(prog));
617 /* Find a possible match in the region s..strend by looking for
618 the "check" substring in the region corrected by start/end_shift. */
621 I32 srch_start_shift = start_shift;
622 I32 srch_end_shift = end_shift;
623 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
624 srch_end_shift -= ((strbeg - s) - srch_start_shift);
625 srch_start_shift = strbeg - s;
627 DEBUG_OPTIMISE_MORE_r({
628 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
629 (IV)prog->check_offset_min,
630 (IV)srch_start_shift,
632 (IV)prog->check_end_shift);
635 if (flags & REXEC_SCREAM) {
636 I32 p = -1; /* Internal iterator of scream. */
637 I32 * const pp = data ? data->scream_pos : &p;
639 if (PL_screamfirst[BmRARE(check)] >= 0
640 || ( BmRARE(check) == '\n'
641 && (BmPREVIOUS(check) == SvCUR(check) - 1)
643 s = screaminstr(sv, check,
644 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
647 /* we may be pointing at the wrong string */
648 if (s && RXp_MATCH_COPIED(prog))
649 s = strbeg + (s - SvPVX_const(sv));
651 *data->scream_olds = s;
656 if (prog->extflags & RXf_CANY_SEEN) {
657 start_point= (U8*)(s + srch_start_shift);
658 end_point= (U8*)(strend - srch_end_shift);
660 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
661 end_point= HOP3(strend, -srch_end_shift, strbeg);
663 DEBUG_OPTIMISE_MORE_r({
664 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
665 (int)(end_point - start_point),
666 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
670 s = fbm_instr( start_point, end_point,
671 check, multiline ? FBMrf_MULTILINE : 0);
674 /* Update the count-of-usability, remove useless subpatterns,
678 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
679 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
680 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
681 (s ? "Found" : "Did not find"),
682 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
683 ? "anchored" : "floating"),
686 (s ? " at offset " : "...\n") );
691 /* Finish the diagnostic message */
692 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
694 /* XXX dmq: first branch is for positive lookbehind...
695 Our check string is offset from the beginning of the pattern.
696 So we need to do any stclass tests offset forward from that
705 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
706 Start with the other substr.
707 XXXX no SCREAM optimization yet - and a very coarse implementation
708 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
709 *always* match. Probably should be marked during compile...
710 Probably it is right to do no SCREAM here...
713 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
714 : (prog->float_substr && prog->anchored_substr))
716 /* Take into account the "other" substring. */
717 /* XXXX May be hopelessly wrong for UTF... */
720 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
723 char * const last = HOP3c(s, -start_shift, strbeg);
725 char * const saved_s = s;
728 t = s - prog->check_offset_max;
729 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
731 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
736 t = HOP3c(t, prog->anchored_offset, strend);
737 if (t < other_last) /* These positions already checked */
739 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
742 /* XXXX It is not documented what units *_offsets are in.
743 We assume bytes, but this is clearly wrong.
744 Meaning this code needs to be carefully reviewed for errors.
748 /* On end-of-str: see comment below. */
749 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
750 if (must == &PL_sv_undef) {
752 DEBUG_r(must = prog->anchored_utf8); /* for debug */
757 HOP3(HOP3(last1, prog->anchored_offset, strend)
758 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
760 multiline ? FBMrf_MULTILINE : 0
763 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
764 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
765 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
766 (s ? "Found" : "Contradicts"),
767 quoted, RE_SV_TAIL(must));
772 if (last1 >= last2) {
773 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
774 ", giving up...\n"));
777 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
778 ", trying floating at offset %ld...\n",
779 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
780 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
781 s = HOP3c(last, 1, strend);
785 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
786 (long)(s - i_strpos)));
787 t = HOP3c(s, -prog->anchored_offset, strbeg);
788 other_last = HOP3c(s, 1, strend);
796 else { /* Take into account the floating substring. */
798 char * const saved_s = s;
801 t = HOP3c(s, -start_shift, strbeg);
803 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
804 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
805 last = HOP3c(t, prog->float_max_offset, strend);
806 s = HOP3c(t, prog->float_min_offset, strend);
809 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
810 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
811 /* fbm_instr() takes into account exact value of end-of-str
812 if the check is SvTAIL(ed). Since false positives are OK,
813 and end-of-str is not later than strend we are OK. */
814 if (must == &PL_sv_undef) {
816 DEBUG_r(must = prog->float_utf8); /* for debug message */
819 s = fbm_instr((unsigned char*)s,
820 (unsigned char*)last + SvCUR(must)
822 must, multiline ? FBMrf_MULTILINE : 0);
824 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
825 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
826 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
827 (s ? "Found" : "Contradicts"),
828 quoted, RE_SV_TAIL(must));
832 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
833 ", giving up...\n"));
836 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
837 ", trying anchored starting at offset %ld...\n",
838 (long)(saved_s + 1 - i_strpos)));
840 s = HOP3c(t, 1, strend);
844 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
845 (long)(s - i_strpos)));
846 other_last = s; /* Fix this later. --Hugo */
856 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
858 DEBUG_OPTIMISE_MORE_r(
859 PerlIO_printf(Perl_debug_log,
860 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
861 (IV)prog->check_offset_min,
862 (IV)prog->check_offset_max,
870 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
872 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
875 /* Fixed substring is found far enough so that the match
876 cannot start at strpos. */
878 if (ml_anch && t[-1] != '\n') {
879 /* Eventually fbm_*() should handle this, but often
880 anchored_offset is not 0, so this check will not be wasted. */
881 /* XXXX In the code below we prefer to look for "^" even in
882 presence of anchored substrings. And we search even
883 beyond the found float position. These pessimizations
884 are historical artefacts only. */
886 while (t < strend - prog->minlen) {
888 if (t < check_at - prog->check_offset_min) {
889 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
890 /* Since we moved from the found position,
891 we definitely contradict the found anchored
892 substr. Due to the above check we do not
893 contradict "check" substr.
894 Thus we can arrive here only if check substr
895 is float. Redo checking for "other"=="fixed".
898 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
899 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
900 goto do_other_anchored;
902 /* We don't contradict the found floating substring. */
903 /* XXXX Why not check for STCLASS? */
905 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
906 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
909 /* Position contradicts check-string */
910 /* XXXX probably better to look for check-string
911 than for "\n", so one should lower the limit for t? */
912 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
913 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
914 other_last = strpos = s = t + 1;
919 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
920 PL_colors[0], PL_colors[1]));
924 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
925 PL_colors[0], PL_colors[1]));
929 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
932 /* The found string does not prohibit matching at strpos,
933 - no optimization of calling REx engine can be performed,
934 unless it was an MBOL and we are not after MBOL,
935 or a future STCLASS check will fail this. */
937 /* Even in this situation we may use MBOL flag if strpos is offset
938 wrt the start of the string. */
939 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
940 && (strpos != strbeg) && strpos[-1] != '\n'
941 /* May be due to an implicit anchor of m{.*foo} */
942 && !(prog->intflags & PREGf_IMPLICIT))
947 DEBUG_EXECUTE_r( if (ml_anch)
948 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
949 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
952 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
954 prog->check_utf8 /* Could be deleted already */
955 && --BmUSEFUL(prog->check_utf8) < 0
956 && (prog->check_utf8 == prog->float_utf8)
958 prog->check_substr /* Could be deleted already */
959 && --BmUSEFUL(prog->check_substr) < 0
960 && (prog->check_substr == prog->float_substr)
963 /* If flags & SOMETHING - do not do it many times on the same match */
964 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
965 /* XXX Does the destruction order has to change with do_utf8? */
966 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
967 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
968 prog->check_substr = prog->check_utf8 = NULL; /* disable */
969 prog->float_substr = prog->float_utf8 = NULL; /* clear */
970 check = NULL; /* abort */
972 /* XXXX This is a remnant of the old implementation. It
973 looks wasteful, since now INTUIT can use many
975 prog->extflags &= ~RXf_USE_INTUIT;
982 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
983 /* trie stclasses are too expensive to use here, we are better off to
984 leave it to regmatch itself */
985 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
986 /* minlen == 0 is possible if regstclass is \b or \B,
987 and the fixed substr is ''$.
988 Since minlen is already taken into account, s+1 is before strend;
989 accidentally, minlen >= 1 guaranties no false positives at s + 1
990 even for \b or \B. But (minlen? 1 : 0) below assumes that
991 regstclass does not come from lookahead... */
992 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
993 This leaves EXACTF only, which is dealt with in find_byclass(). */
994 const U8* const str = (U8*)STRING(progi->regstclass);
995 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
996 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
999 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1000 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1001 else if (prog->float_substr || prog->float_utf8)
1002 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1006 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1007 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1010 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1013 const char *what = NULL;
1015 if (endpos == strend) {
1016 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1017 "Could not match STCLASS...\n") );
1020 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1021 "This position contradicts STCLASS...\n") );
1022 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1024 /* Contradict one of substrings */
1025 if (prog->anchored_substr || prog->anchored_utf8) {
1026 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1027 DEBUG_EXECUTE_r( what = "anchored" );
1029 s = HOP3c(t, 1, strend);
1030 if (s + start_shift + end_shift > strend) {
1031 /* XXXX Should be taken into account earlier? */
1032 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1033 "Could not match STCLASS...\n") );
1038 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1039 "Looking for %s substr starting at offset %ld...\n",
1040 what, (long)(s + start_shift - i_strpos)) );
1043 /* Have both, check_string is floating */
1044 if (t + start_shift >= check_at) /* Contradicts floating=check */
1045 goto retry_floating_check;
1046 /* Recheck anchored substring, but not floating... */
1050 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1051 "Looking for anchored substr starting at offset %ld...\n",
1052 (long)(other_last - i_strpos)) );
1053 goto do_other_anchored;
1055 /* Another way we could have checked stclass at the
1056 current position only: */
1061 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1062 "Looking for /%s^%s/m starting at offset %ld...\n",
1063 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1066 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1068 /* Check is floating subtring. */
1069 retry_floating_check:
1070 t = check_at - start_shift;
1071 DEBUG_EXECUTE_r( what = "floating" );
1072 goto hop_and_restart;
1075 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1076 "By STCLASS: moving %ld --> %ld\n",
1077 (long)(t - i_strpos), (long)(s - i_strpos))
1081 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1082 "Does not contradict STCLASS...\n");
1087 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1088 PL_colors[4], (check ? "Guessed" : "Giving up"),
1089 PL_colors[5], (long)(s - i_strpos)) );
1092 fail_finish: /* Substring not found */
1093 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1094 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1096 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1097 PL_colors[4], PL_colors[5]));
1101 #define DECL_TRIE_TYPE(scan) \
1102 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1103 trie_type = (scan->flags != EXACT) \
1104 ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1105 : (do_utf8 ? trie_utf8 : trie_plain)
1107 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1108 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1109 switch (trie_type) { \
1110 case trie_utf8_fold: \
1111 if ( foldlen>0 ) { \
1112 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1117 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1118 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1119 foldlen -= UNISKIP( uvc ); \
1120 uscan = foldbuf + UNISKIP( uvc ); \
1123 case trie_latin_utf8_fold: \
1124 if ( foldlen>0 ) { \
1125 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1131 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1132 foldlen -= UNISKIP( uvc ); \
1133 uscan = foldbuf + UNISKIP( uvc ); \
1137 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1144 charid = trie->charmap[ uvc ]; \
1148 if (widecharmap) { \
1149 SV** const svpp = hv_fetch(widecharmap, \
1150 (char*)&uvc, sizeof(UV), 0); \
1152 charid = (U16)SvIV(*svpp); \
1157 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1159 char *my_strend= (char *)strend; \
1162 !ibcmp_utf8(s, &my_strend, 0, do_utf8, \
1163 m, NULL, ln, (bool)UTF)) \
1164 && (!reginfo || regtry(reginfo, &s)) ) \
1167 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1168 uvchr_to_utf8(tmpbuf, c); \
1169 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1171 && (f == c1 || f == c2) \
1173 !ibcmp_utf8(s, &my_strend, 0, do_utf8,\
1174 m, NULL, ln, (bool)UTF)) \
1175 && (!reginfo || regtry(reginfo, &s)) ) \
1181 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1185 && (ln == 1 || !(OP(c) == EXACTF \
1187 : ibcmp_locale(s, m, ln))) \
1188 && (!reginfo || regtry(reginfo, &s)) ) \
1194 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1196 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1202 #define REXEC_FBC_SCAN(CoDe) \
1204 while (s < strend) { \
1210 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1211 REXEC_FBC_UTF8_SCAN( \
1213 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1222 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1225 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1234 #define REXEC_FBC_TRYIT \
1235 if ((!reginfo || regtry(reginfo, &s))) \
1238 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1240 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1243 REXEC_FBC_CLASS_SCAN(CoNd); \
1247 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1250 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1253 REXEC_FBC_CLASS_SCAN(CoNd); \
1257 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1258 PL_reg_flags |= RF_tainted; \
1260 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1263 REXEC_FBC_CLASS_SCAN(CoNd); \
1267 #define DUMP_EXEC_POS(li,s,doutf8) \
1268 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1270 /* We know what class REx starts with. Try to find this position... */
1271 /* if reginfo is NULL, its a dryrun */
1272 /* annoyingly all the vars in this routine have different names from their counterparts
1273 in regmatch. /grrr */
1276 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1277 const char *strend, regmatch_info *reginfo)
1280 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1284 register STRLEN uskip;
1288 register I32 tmp = 1; /* Scratch variable? */
1289 register const bool do_utf8 = PL_reg_match_utf8;
1290 RXi_GET_DECL(prog,progi);
1292 PERL_ARGS_ASSERT_FIND_BYCLASS;
1294 /* We know what class it must start with. */
1298 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1299 !UTF8_IS_INVARIANT((U8)s[0]) ?
1300 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1301 REGINCLASS(prog, c, (U8*)s));
1304 while (s < strend) {
1307 if (REGINCLASS(prog, c, (U8*)s) ||
1308 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1309 /* The assignment of 2 is intentional:
1310 * for the folded sharp s, the skip is 2. */
1311 (skip = SHARP_S_SKIP))) {
1312 if (tmp && (!reginfo || regtry(reginfo, &s)))
1325 if (tmp && (!reginfo || regtry(reginfo, &s)))
1333 ln = STR_LEN(c); /* length to match in octets/bytes */
1334 lnc = (I32) ln; /* length to match in characters */
1336 STRLEN ulen1, ulen2;
1338 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1339 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1340 /* used by commented-out code below */
1341 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1343 /* XXX: Since the node will be case folded at compile
1344 time this logic is a little odd, although im not
1345 sure that its actually wrong. --dmq */
1347 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1348 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1350 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1351 codepoint of the first character in the converted
1352 form, yet originally we did the extra step.
1353 No tests fail by commenting this code out however
1354 so Ive left it out. -- dmq.
1356 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1358 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1363 while (sm < ((U8 *) m + ln)) {
1378 c2 = PL_fold_locale[c1];
1380 e = HOP3c(strend, -((I32)lnc), s);
1382 if (!reginfo && e < s)
1383 e = s; /* Due to minlen logic of intuit() */
1385 /* The idea in the EXACTF* cases is to first find the
1386 * first character of the EXACTF* node and then, if
1387 * necessary, case-insensitively compare the full
1388 * text of the node. The c1 and c2 are the first
1389 * characters (though in Unicode it gets a bit
1390 * more complicated because there are more cases
1391 * than just upper and lower: one needs to use
1392 * the so-called folding case for case-insensitive
1393 * matching (called "loose matching" in Unicode).
1394 * ibcmp_utf8() will do just that. */
1396 if (do_utf8 || UTF) {
1398 U8 tmpbuf [UTF8_MAXBYTES+1];
1401 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1403 /* Upper and lower of 1st char are equal -
1404 * probably not a "letter". */
1407 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1412 REXEC_FBC_EXACTISH_CHECK(c == c1);
1418 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1424 /* Handle some of the three Greek sigmas cases.
1425 * Note that not all the possible combinations
1426 * are handled here: some of them are handled
1427 * by the standard folding rules, and some of
1428 * them (the character class or ANYOF cases)
1429 * are handled during compiletime in
1430 * regexec.c:S_regclass(). */
1431 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1432 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1433 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1435 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1440 /* Neither pattern nor string are UTF8 */
1442 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1444 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1448 PL_reg_flags |= RF_tainted;
1455 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1456 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1458 tmp = ((OP(c) == BOUND ?
1459 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1460 LOAD_UTF8_CHARCLASS_ALNUM();
1461 REXEC_FBC_UTF8_SCAN(
1462 if (tmp == !(OP(c) == BOUND ?
1463 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1464 isALNUM_LC_utf8((U8*)s)))
1472 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1473 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1476 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1482 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1486 PL_reg_flags |= RF_tainted;
1493 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1494 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1496 tmp = ((OP(c) == NBOUND ?
1497 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1498 LOAD_UTF8_CHARCLASS_ALNUM();
1499 REXEC_FBC_UTF8_SCAN(
1500 if (tmp == !(OP(c) == NBOUND ?
1501 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1502 isALNUM_LC_utf8((U8*)s)))
1504 else REXEC_FBC_TRYIT;
1508 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1509 tmp = ((OP(c) == NBOUND ?
1510 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1513 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1515 else REXEC_FBC_TRYIT;
1518 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1522 REXEC_FBC_CSCAN_PRELOAD(
1523 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1524 swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1528 REXEC_FBC_CSCAN_TAINT(
1529 isALNUM_LC_utf8((U8*)s),
1533 REXEC_FBC_CSCAN_PRELOAD(
1534 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1535 !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1539 REXEC_FBC_CSCAN_TAINT(
1540 !isALNUM_LC_utf8((U8*)s),
1544 REXEC_FBC_CSCAN_PRELOAD(
1545 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1546 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
1550 REXEC_FBC_CSCAN_TAINT(
1551 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1555 REXEC_FBC_CSCAN_PRELOAD(
1556 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1557 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
1561 REXEC_FBC_CSCAN_TAINT(
1562 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1566 REXEC_FBC_CSCAN_PRELOAD(
1567 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1568 swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1572 REXEC_FBC_CSCAN_TAINT(
1573 isDIGIT_LC_utf8((U8*)s),
1577 REXEC_FBC_CSCAN_PRELOAD(
1578 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1579 !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1583 REXEC_FBC_CSCAN_TAINT(
1584 !isDIGIT_LC_utf8((U8*)s),
1590 is_LNBREAK_latin1(s)
1600 !is_VERTWS_latin1(s)
1605 is_HORIZWS_latin1(s)
1609 !is_HORIZWS_utf8(s),
1610 !is_HORIZWS_latin1(s)
1616 /* what trie are we using right now */
1618 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1620 = (reg_trie_data*)progi->data->data[ aho->trie ];
1621 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1623 const char *last_start = strend - trie->minlen;
1625 const char *real_start = s;
1627 STRLEN maxlen = trie->maxlen;
1629 U8 **points; /* map of where we were in the input string
1630 when reading a given char. For ASCII this
1631 is unnecessary overhead as the relationship
1632 is always 1:1, but for Unicode, especially
1633 case folded Unicode this is not true. */
1634 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1638 GET_RE_DEBUG_FLAGS_DECL;
1640 /* We can't just allocate points here. We need to wrap it in
1641 * an SV so it gets freed properly if there is a croak while
1642 * running the match */
1645 sv_points=newSV(maxlen * sizeof(U8 *));
1646 SvCUR_set(sv_points,
1647 maxlen * sizeof(U8 *));
1648 SvPOK_on(sv_points);
1649 sv_2mortal(sv_points);
1650 points=(U8**)SvPV_nolen(sv_points );
1651 if ( trie_type != trie_utf8_fold
1652 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1655 bitmap=(U8*)trie->bitmap;
1657 bitmap=(U8*)ANYOF_BITMAP(c);
1659 /* this is the Aho-Corasick algorithm modified a touch
1660 to include special handling for long "unknown char"
1661 sequences. The basic idea being that we use AC as long
1662 as we are dealing with a possible matching char, when
1663 we encounter an unknown char (and we have not encountered
1664 an accepting state) we scan forward until we find a legal
1666 AC matching is basically that of trie matching, except
1667 that when we encounter a failing transition, we fall back
1668 to the current states "fail state", and try the current char
1669 again, a process we repeat until we reach the root state,
1670 state 1, or a legal transition. If we fail on the root state
1671 then we can either terminate if we have reached an accepting
1672 state previously, or restart the entire process from the beginning
1676 while (s <= last_start) {
1677 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1685 U8 *uscan = (U8*)NULL;
1686 U8 *leftmost = NULL;
1688 U32 accepted_word= 0;
1692 while ( state && uc <= (U8*)strend ) {
1694 U32 word = aho->states[ state ].wordnum;
1698 DEBUG_TRIE_EXECUTE_r(
1699 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1700 dump_exec_pos( (char *)uc, c, strend, real_start,
1701 (char *)uc, do_utf8 );
1702 PerlIO_printf( Perl_debug_log,
1703 " Scanning for legal start char...\n");
1706 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1711 if (uc >(U8*)last_start) break;
1715 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1716 if (!leftmost || lpos < leftmost) {
1717 DEBUG_r(accepted_word=word);
1723 points[pointpos++ % maxlen]= uc;
1724 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1725 uscan, len, uvc, charid, foldlen,
1727 DEBUG_TRIE_EXECUTE_r({
1728 dump_exec_pos( (char *)uc, c, strend, real_start,
1730 PerlIO_printf(Perl_debug_log,
1731 " Charid:%3u CP:%4"UVxf" ",
1737 word = aho->states[ state ].wordnum;
1739 base = aho->states[ state ].trans.base;
1741 DEBUG_TRIE_EXECUTE_r({
1743 dump_exec_pos( (char *)uc, c, strend, real_start,
1745 PerlIO_printf( Perl_debug_log,
1746 "%sState: %4"UVxf", word=%"UVxf,
1747 failed ? " Fail transition to " : "",
1748 (UV)state, (UV)word);
1753 (base + charid > trie->uniquecharcount )
1754 && (base + charid - 1 - trie->uniquecharcount
1756 && trie->trans[base + charid - 1 -
1757 trie->uniquecharcount].check == state
1758 && (tmp=trie->trans[base + charid - 1 -
1759 trie->uniquecharcount ].next))
1761 DEBUG_TRIE_EXECUTE_r(
1762 PerlIO_printf( Perl_debug_log," - legal\n"));
1767 DEBUG_TRIE_EXECUTE_r(
1768 PerlIO_printf( Perl_debug_log," - fail\n"));
1770 state = aho->fail[state];
1774 /* we must be accepting here */
1775 DEBUG_TRIE_EXECUTE_r(
1776 PerlIO_printf( Perl_debug_log," - accepting\n"));
1785 if (!state) state = 1;
1788 if ( aho->states[ state ].wordnum ) {
1789 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1790 if (!leftmost || lpos < leftmost) {
1791 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1796 s = (char*)leftmost;
1797 DEBUG_TRIE_EXECUTE_r({
1799 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1800 (UV)accepted_word, (IV)(s - real_start)
1803 if (!reginfo || regtry(reginfo, &s)) {
1809 DEBUG_TRIE_EXECUTE_r({
1810 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1813 DEBUG_TRIE_EXECUTE_r(
1814 PerlIO_printf( Perl_debug_log,"No match.\n"));
1823 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1833 - regexec_flags - match a regexp against a string
1836 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1837 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1838 /* strend: pointer to null at end of string */
1839 /* strbeg: real beginning of string */
1840 /* minend: end of match must be >=minend after stringarg. */
1841 /* data: May be used for some additional optimizations.
1842 Currently its only used, with a U32 cast, for transmitting
1843 the ganch offset when doing a /g match. This will change */
1844 /* nosave: For optimizations. */
1847 struct regexp *const prog = (struct regexp *)SvANY(rx);
1848 /*register*/ char *s;
1849 register regnode *c;
1850 /*register*/ char *startpos = stringarg;
1851 I32 minlen; /* must match at least this many chars */
1852 I32 dontbother = 0; /* how many characters not to try at end */
1853 I32 end_shift = 0; /* Same for the end. */ /* CC */
1854 I32 scream_pos = -1; /* Internal iterator of scream. */
1855 char *scream_olds = NULL;
1856 const bool do_utf8 = (bool)DO_UTF8(sv);
1858 RXi_GET_DECL(prog,progi);
1859 regmatch_info reginfo; /* create some info to pass to regtry etc */
1860 regexp_paren_pair *swap = NULL;
1861 GET_RE_DEBUG_FLAGS_DECL;
1863 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1864 PERL_UNUSED_ARG(data);
1866 /* Be paranoid... */
1867 if (prog == NULL || startpos == NULL) {
1868 Perl_croak(aTHX_ "NULL regexp parameter");
1872 multiline = prog->extflags & RXf_PMf_MULTILINE;
1873 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
1875 RX_MATCH_UTF8_set(rx, do_utf8);
1877 debug_start_match(rx, do_utf8, startpos, strend,
1881 minlen = prog->minlen;
1883 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1884 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1885 "String too short [regexec_flags]...\n"));
1890 /* Check validity of program. */
1891 if (UCHARAT(progi->program) != REG_MAGIC) {
1892 Perl_croak(aTHX_ "corrupted regexp program");
1896 PL_reg_eval_set = 0;
1900 PL_reg_flags |= RF_utf8;
1902 /* Mark beginning of line for ^ and lookbehind. */
1903 reginfo.bol = startpos; /* XXX not used ??? */
1907 /* Mark end of line for $ (and such) */
1910 /* see how far we have to get to not match where we matched before */
1911 reginfo.till = startpos+minend;
1913 /* If there is a "must appear" string, look for it. */
1916 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1918 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
1919 reginfo.ganch = startpos + prog->gofs;
1920 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1921 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1922 } else if (sv && SvTYPE(sv) >= SVt_PVMG
1924 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1925 && mg->mg_len >= 0) {
1926 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1927 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1928 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
1930 if (prog->extflags & RXf_ANCH_GPOS) {
1931 if (s > reginfo.ganch)
1933 s = reginfo.ganch - prog->gofs;
1934 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1935 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
1941 reginfo.ganch = strbeg + PTR2UV(data);
1942 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1943 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1945 } else { /* pos() not defined */
1946 reginfo.ganch = strbeg;
1947 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1948 "GPOS: reginfo.ganch = strbeg\n"));
1951 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
1952 /* We have to be careful. If the previous successful match
1953 was from this regex we don't want a subsequent partially
1954 successful match to clobber the old results.
1955 So when we detect this possibility we add a swap buffer
1956 to the re, and switch the buffer each match. If we fail
1957 we switch it back, otherwise we leave it swapped.
1960 /* do we need a save destructor here for eval dies? */
1961 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
1963 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1964 re_scream_pos_data d;
1966 d.scream_olds = &scream_olds;
1967 d.scream_pos = &scream_pos;
1968 s = re_intuit_start(rx, sv, s, strend, flags, &d);
1970 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1971 goto phooey; /* not present */
1977 /* Simplest case: anchored match need be tried only once. */
1978 /* [unless only anchor is BOL and multiline is set] */
1979 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1980 if (s == startpos && regtry(®info, &startpos))
1982 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1983 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
1988 dontbother = minlen - 1;
1989 end = HOP3c(strend, -dontbother, strbeg) - 1;
1990 /* for multiline we only have to try after newlines */
1991 if (prog->check_substr || prog->check_utf8) {
1995 if (regtry(®info, &s))
2000 if (prog->extflags & RXf_USE_INTUIT) {
2001 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2012 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2013 if (regtry(®info, &s))
2020 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2022 /* the warning about reginfo.ganch being used without intialization
2023 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2024 and we only enter this block when the same bit is set. */
2025 char *tmp_s = reginfo.ganch - prog->gofs;
2027 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2032 /* Messy cases: unanchored match. */
2033 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2034 /* we have /x+whatever/ */
2035 /* it must be a one character string (XXXX Except UTF?) */
2040 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2041 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2042 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
2047 DEBUG_EXECUTE_r( did_match = 1 );
2048 if (regtry(®info, &s)) goto got_it;
2050 while (s < strend && *s == ch)
2058 DEBUG_EXECUTE_r( did_match = 1 );
2059 if (regtry(®info, &s)) goto got_it;
2061 while (s < strend && *s == ch)
2066 DEBUG_EXECUTE_r(if (!did_match)
2067 PerlIO_printf(Perl_debug_log,
2068 "Did not find anchored character...\n")
2071 else if (prog->anchored_substr != NULL
2072 || prog->anchored_utf8 != NULL
2073 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2074 && prog->float_max_offset < strend - s)) {
2079 char *last1; /* Last position checked before */
2083 if (prog->anchored_substr || prog->anchored_utf8) {
2084 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2085 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2086 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2087 back_max = back_min = prog->anchored_offset;
2089 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2090 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2091 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2092 back_max = prog->float_max_offset;
2093 back_min = prog->float_min_offset;
2097 if (must == &PL_sv_undef)
2098 /* could not downgrade utf8 check substring, so must fail */
2104 last = HOP3c(strend, /* Cannot start after this */
2105 -(I32)(CHR_SVLEN(must)
2106 - (SvTAIL(must) != 0) + back_min), strbeg);
2109 last1 = HOPc(s, -1);
2111 last1 = s - 1; /* bogus */
2113 /* XXXX check_substr already used to find "s", can optimize if
2114 check_substr==must. */
2116 dontbother = end_shift;
2117 strend = HOPc(strend, -dontbother);
2118 while ( (s <= last) &&
2119 ((flags & REXEC_SCREAM)
2120 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2121 end_shift, &scream_pos, 0))
2122 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2123 (unsigned char*)strend, must,
2124 multiline ? FBMrf_MULTILINE : 0))) ) {
2125 /* we may be pointing at the wrong string */
2126 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2127 s = strbeg + (s - SvPVX_const(sv));
2128 DEBUG_EXECUTE_r( did_match = 1 );
2129 if (HOPc(s, -back_max) > last1) {
2130 last1 = HOPc(s, -back_min);
2131 s = HOPc(s, -back_max);
2134 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2136 last1 = HOPc(s, -back_min);
2140 while (s <= last1) {
2141 if (regtry(®info, &s))
2147 while (s <= last1) {
2148 if (regtry(®info, &s))
2154 DEBUG_EXECUTE_r(if (!did_match) {
2155 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2156 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2157 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2158 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2159 ? "anchored" : "floating"),
2160 quoted, RE_SV_TAIL(must));
2164 else if ( (c = progi->regstclass) ) {
2166 const OPCODE op = OP(progi->regstclass);
2167 /* don't bother with what can't match */
2168 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2169 strend = HOPc(strend, -(minlen - 1));
2172 SV * const prop = sv_newmortal();
2173 regprop(prog, prop, c);
2175 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2177 PerlIO_printf(Perl_debug_log,
2178 "Matching stclass %.*s against %s (%d chars)\n",
2179 (int)SvCUR(prop), SvPVX_const(prop),
2180 quoted, (int)(strend - s));
2183 if (find_byclass(prog, c, s, strend, ®info))
2185 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2189 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2194 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2195 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2196 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2198 if (flags & REXEC_SCREAM) {
2199 last = screaminstr(sv, float_real, s - strbeg,
2200 end_shift, &scream_pos, 1); /* last one */
2202 last = scream_olds; /* Only one occurrence. */
2203 /* we may be pointing at the wrong string */
2204 else if (RXp_MATCH_COPIED(prog))
2205 s = strbeg + (s - SvPVX_const(sv));
2209 const char * const little = SvPV_const(float_real, len);
2211 if (SvTAIL(float_real)) {
2212 if (memEQ(strend - len + 1, little, len - 1))
2213 last = strend - len + 1;
2214 else if (!multiline)
2215 last = memEQ(strend - len, little, len)
2216 ? strend - len : NULL;
2222 last = rninstr(s, strend, little, little + len);
2224 last = strend; /* matching "$" */
2229 PerlIO_printf(Perl_debug_log,
2230 "%sCan't trim the tail, match fails (should not happen)%s\n",
2231 PL_colors[4], PL_colors[5]));
2232 goto phooey; /* Should not happen! */
2234 dontbother = strend - last + prog->float_min_offset;
2236 if (minlen && (dontbother < minlen))
2237 dontbother = minlen - 1;
2238 strend -= dontbother; /* this one's always in bytes! */
2239 /* We don't know much -- general case. */
2242 if (regtry(®info, &s))
2251 if (regtry(®info, &s))
2253 } while (s++ < strend);
2262 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2264 if (PL_reg_eval_set)
2265 restore_pos(aTHX_ prog);
2266 if (RXp_PAREN_NAMES(prog))
2267 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2269 /* make sure $`, $&, $', and $digit will work later */
2270 if ( !(flags & REXEC_NOT_FIRST) ) {
2271 RX_MATCH_COPY_FREE(rx);
2272 if (flags & REXEC_COPY_STR) {
2273 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2274 #ifdef PERL_OLD_COPY_ON_WRITE
2276 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2278 PerlIO_printf(Perl_debug_log,
2279 "Copy on write: regexp capture, type %d\n",
2282 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2283 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2284 assert (SvPOKp(prog->saved_copy));
2288 RX_MATCH_COPIED_on(rx);
2289 s = savepvn(strbeg, i);
2295 prog->subbeg = strbeg;
2296 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2303 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2304 PL_colors[4], PL_colors[5]));
2305 if (PL_reg_eval_set)
2306 restore_pos(aTHX_ prog);
2308 /* we failed :-( roll it back */
2309 Safefree(prog->offs);
2318 - regtry - try match at specific point
2320 STATIC I32 /* 0 failure, 1 success */
2321 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2325 REGEXP *const rx = reginfo->prog;
2326 regexp *const prog = (struct regexp *)SvANY(rx);
2327 RXi_GET_DECL(prog,progi);
2328 GET_RE_DEBUG_FLAGS_DECL;
2330 PERL_ARGS_ASSERT_REGTRY;
2332 reginfo->cutpoint=NULL;
2334 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2337 PL_reg_eval_set = RS_init;
2338 DEBUG_EXECUTE_r(DEBUG_s(
2339 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2340 (IV)(PL_stack_sp - PL_stack_base));
2343 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2344 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2346 /* Apparently this is not needed, judging by wantarray. */
2347 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2348 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2351 /* Make $_ available to executed code. */
2352 if (reginfo->sv != DEFSV) {
2354 DEFSV_set(reginfo->sv);
2357 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2358 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2359 /* prepare for quick setting of pos */
2360 #ifdef PERL_OLD_COPY_ON_WRITE
2361 if (SvIsCOW(reginfo->sv))
2362 sv_force_normal_flags(reginfo->sv, 0);
2364 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2365 &PL_vtbl_mglob, NULL, 0);
2369 PL_reg_oldpos = mg->mg_len;
2370 SAVEDESTRUCTOR_X(restore_pos, prog);
2372 if (!PL_reg_curpm) {
2373 Newxz(PL_reg_curpm, 1, PMOP);
2376 SV* const repointer = &PL_sv_undef;
2377 /* this regexp is also owned by the new PL_reg_curpm, which
2378 will try to free it. */
2379 av_push(PL_regex_padav, repointer);
2380 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2381 PL_regex_pad = AvARRAY(PL_regex_padav);
2386 /* It seems that non-ithreads works both with and without this code.
2387 So for efficiency reasons it seems best not to have the code
2388 compiled when it is not needed. */
2389 /* This is safe against NULLs: */
2390 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2391 /* PM_reg_curpm owns a reference to this regexp. */
2394 PM_SETRE(PL_reg_curpm, rx);
2395 PL_reg_oldcurpm = PL_curpm;
2396 PL_curpm = PL_reg_curpm;
2397 if (RXp_MATCH_COPIED(prog)) {
2398 /* Here is a serious problem: we cannot rewrite subbeg,
2399 since it may be needed if this match fails. Thus
2400 $` inside (?{}) could fail... */
2401 PL_reg_oldsaved = prog->subbeg;
2402 PL_reg_oldsavedlen = prog->sublen;
2403 #ifdef PERL_OLD_COPY_ON_WRITE
2404 PL_nrs = prog->saved_copy;
2406 RXp_MATCH_COPIED_off(prog);
2409 PL_reg_oldsaved = NULL;
2410 prog->subbeg = PL_bostr;
2411 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2413 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2414 prog->offs[0].start = *startpos - PL_bostr;
2415 PL_reginput = *startpos;
2416 PL_reglastparen = &prog->lastparen;
2417 PL_reglastcloseparen = &prog->lastcloseparen;
2418 prog->lastparen = 0;
2419 prog->lastcloseparen = 0;
2421 PL_regoffs = prog->offs;
2422 if (PL_reg_start_tmpl <= prog->nparens) {
2423 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2424 if(PL_reg_start_tmp)
2425 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2427 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2430 /* XXXX What this code is doing here?!!! There should be no need
2431 to do this again and again, PL_reglastparen should take care of
2434 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2435 * Actually, the code in regcppop() (which Ilya may be meaning by
2436 * PL_reglastparen), is not needed at all by the test suite
2437 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2438 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2439 * Meanwhile, this code *is* needed for the
2440 * above-mentioned test suite tests to succeed. The common theme
2441 * on those tests seems to be returning null fields from matches.
2442 * --jhi updated by dapm */
2444 if (prog->nparens) {
2445 regexp_paren_pair *pp = PL_regoffs;
2447 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2455 if (regmatch(reginfo, progi->program + 1)) {
2456 PL_regoffs[0].end = PL_reginput - PL_bostr;
2459 if (reginfo->cutpoint)
2460 *startpos= reginfo->cutpoint;
2461 REGCP_UNWIND(lastcp);
2466 #define sayYES goto yes
2467 #define sayNO goto no
2468 #define sayNO_SILENT goto no_silent
2470 /* we dont use STMT_START/END here because it leads to
2471 "unreachable code" warnings, which are bogus, but distracting. */
2472 #define CACHEsayNO \
2473 if (ST.cache_mask) \
2474 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2477 /* this is used to determine how far from the left messages like
2478 'failed...' are printed. It should be set such that messages
2479 are inline with the regop output that created them.
2481 #define REPORT_CODE_OFF 32
2484 /* Make sure there is a test for this +1 options in re_tests */
2485 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2487 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2488 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2490 #define SLAB_FIRST(s) (&(s)->states[0])
2491 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2493 /* grab a new slab and return the first slot in it */
2495 STATIC regmatch_state *
2498 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2501 regmatch_slab *s = PL_regmatch_slab->next;
2503 Newx(s, 1, regmatch_slab);
2504 s->prev = PL_regmatch_slab;
2506 PL_regmatch_slab->next = s;
2508 PL_regmatch_slab = s;
2509 return SLAB_FIRST(s);
2513 /* push a new state then goto it */
2515 #define PUSH_STATE_GOTO(state, node) \
2517 st->resume_state = state; \
2520 /* push a new state with success backtracking, then goto it */
2522 #define PUSH_YES_STATE_GOTO(state, node) \
2524 st->resume_state = state; \
2525 goto push_yes_state;
2531 regmatch() - main matching routine
2533 This is basically one big switch statement in a loop. We execute an op,
2534 set 'next' to point the next op, and continue. If we come to a point which
2535 we may need to backtrack to on failure such as (A|B|C), we push a
2536 backtrack state onto the backtrack stack. On failure, we pop the top
2537 state, and re-enter the loop at the state indicated. If there are no more
2538 states to pop, we return failure.
2540 Sometimes we also need to backtrack on success; for example /A+/, where
2541 after successfully matching one A, we need to go back and try to
2542 match another one; similarly for lookahead assertions: if the assertion
2543 completes successfully, we backtrack to the state just before the assertion
2544 and then carry on. In these cases, the pushed state is marked as
2545 'backtrack on success too'. This marking is in fact done by a chain of
2546 pointers, each pointing to the previous 'yes' state. On success, we pop to
2547 the nearest yes state, discarding any intermediate failure-only states.
2548 Sometimes a yes state is pushed just to force some cleanup code to be
2549 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2550 it to free the inner regex.
2552 Note that failure backtracking rewinds the cursor position, while
2553 success backtracking leaves it alone.
2555 A pattern is complete when the END op is executed, while a subpattern
2556 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2557 ops trigger the "pop to last yes state if any, otherwise return true"
2560 A common convention in this function is to use A and B to refer to the two
2561 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2562 the subpattern to be matched possibly multiple times, while B is the entire
2563 rest of the pattern. Variable and state names reflect this convention.
2565 The states in the main switch are the union of ops and failure/success of
2566 substates associated with with that op. For example, IFMATCH is the op
2567 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2568 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2569 successfully matched A and IFMATCH_A_fail is a state saying that we have
2570 just failed to match A. Resume states always come in pairs. The backtrack
2571 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2572 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2573 on success or failure.
2575 The struct that holds a backtracking state is actually a big union, with
2576 one variant for each major type of op. The variable st points to the
2577 top-most backtrack struct. To make the code clearer, within each
2578 block of code we #define ST to alias the relevant union.
2580 Here's a concrete example of a (vastly oversimplified) IFMATCH
2586 #define ST st->u.ifmatch
2588 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2589 ST.foo = ...; // some state we wish to save
2591 // push a yes backtrack state with a resume value of
2592 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2594 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2597 case IFMATCH_A: // we have successfully executed A; now continue with B
2599 bar = ST.foo; // do something with the preserved value
2602 case IFMATCH_A_fail: // A failed, so the assertion failed
2603 ...; // do some housekeeping, then ...
2604 sayNO; // propagate the failure
2611 For any old-timers reading this who are familiar with the old recursive
2612 approach, the code above is equivalent to:
2614 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2623 ...; // do some housekeeping, then ...
2624 sayNO; // propagate the failure
2627 The topmost backtrack state, pointed to by st, is usually free. If you
2628 want to claim it, populate any ST.foo fields in it with values you wish to
2629 save, then do one of
2631 PUSH_STATE_GOTO(resume_state, node);
2632 PUSH_YES_STATE_GOTO(resume_state, node);
2634 which sets that backtrack state's resume value to 'resume_state', pushes a
2635 new free entry to the top of the backtrack stack, then goes to 'node'.
2636 On backtracking, the free slot is popped, and the saved state becomes the
2637 new free state. An ST.foo field in this new top state can be temporarily
2638 accessed to retrieve values, but once the main loop is re-entered, it
2639 becomes available for reuse.
2641 Note that the depth of the backtrack stack constantly increases during the
2642 left-to-right execution of the pattern, rather than going up and down with
2643 the pattern nesting. For example the stack is at its maximum at Z at the
2644 end of the pattern, rather than at X in the following:
2646 /(((X)+)+)+....(Y)+....Z/
2648 The only exceptions to this are lookahead/behind assertions and the cut,
2649 (?>A), which pop all the backtrack states associated with A before
2652 Bascktrack state structs are allocated in slabs of about 4K in size.
2653 PL_regmatch_state and st always point to the currently active state,
2654 and PL_regmatch_slab points to the slab currently containing
2655 PL_regmatch_state. The first time regmatch() is called, the first slab is
2656 allocated, and is never freed until interpreter destruction. When the slab
2657 is full, a new one is allocated and chained to the end. At exit from
2658 regmatch(), slabs allocated since entry are freed.
2663 #define DEBUG_STATE_pp(pp) \
2665 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2666 PerlIO_printf(Perl_debug_log, \
2667 " %*s"pp" %s%s%s%s%s\n", \
2669 PL_reg_name[st->resume_state], \
2670 ((st==yes_state||st==mark_state) ? "[" : ""), \
2671 ((st==yes_state) ? "Y" : ""), \
2672 ((st==mark_state) ? "M" : ""), \
2673 ((st==yes_state||st==mark_state) ? "]" : "") \
2678 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2683 S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
2684 const char *start, const char *end, const char *blurb)
2686 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2688 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2693 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2694 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2696 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2697 start, end - start, 60);
2699 PerlIO_printf(Perl_debug_log,
2700 "%s%s REx%s %s against %s\n",
2701 PL_colors[4], blurb, PL_colors[5], s0, s1);
2703 if (do_utf8||utf8_pat)
2704 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2705 utf8_pat ? "pattern" : "",
2706 utf8_pat && do_utf8 ? " and " : "",
2707 do_utf8 ? "string" : ""
2713 S_dump_exec_pos(pTHX_ const char *locinput,
2714 const regnode *scan,
2715 const char *loc_regeol,
2716 const char *loc_bostr,
2717 const char *loc_reg_starttry,
2720 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2721 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2722 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2723 /* The part of the string before starttry has one color
2724 (pref0_len chars), between starttry and current
2725 position another one (pref_len - pref0_len chars),
2726 after the current position the third one.
2727 We assume that pref0_len <= pref_len, otherwise we
2728 decrease pref0_len. */
2729 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2730 ? (5 + taill) - l : locinput - loc_bostr;
2733 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2735 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2737 pref0_len = pref_len - (locinput - loc_reg_starttry);
2738 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2739 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2740 ? (5 + taill) - pref_len : loc_regeol - locinput);
2741 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2745 if (pref0_len > pref_len)
2746 pref0_len = pref_len;
2748 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2750 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2751 (locinput - pref_len),pref0_len, 60, 4, 5);
2753 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2754 (locinput - pref_len + pref0_len),
2755 pref_len - pref0_len, 60, 2, 3);
2757 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2758 locinput, loc_regeol - locinput, 10, 0, 1);
2760 const STRLEN tlen=len0+len1+len2;
2761 PerlIO_printf(Perl_debug_log,
2762 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2763 (IV)(locinput - loc_bostr),
2766 (docolor ? "" : "> <"),
2768 (int)(tlen > 19 ? 0 : 19 - tlen),
2775 /* reg_check_named_buff_matched()
2776 * Checks to see if a named buffer has matched. The data array of
2777 * buffer numbers corresponding to the buffer is expected to reside
2778 * in the regexp->data->data array in the slot stored in the ARG() of
2779 * node involved. Note that this routine doesn't actually care about the
2780 * name, that information is not preserved from compilation to execution.
2781 * Returns the index of the leftmost defined buffer with the given name
2782 * or 0 if non of the buffers matched.
2785 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2788 RXi_GET_DECL(rex,rexi);
2789 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2790 I32 *nums=(I32*)SvPVX(sv_dat);
2792 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2794 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2795 if ((I32)*PL_reglastparen >= nums[n] &&
2796 PL_regoffs[nums[n]].end != -1)
2805 /* free all slabs above current one - called during LEAVE_SCOPE */
2808 S_clear_backtrack_stack(pTHX_ void *p)
2810 regmatch_slab *s = PL_regmatch_slab->next;
2815 PL_regmatch_slab->next = NULL;
2817 regmatch_slab * const osl = s;
2824 #define SETREX(Re1,Re2) \
2825 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2828 STATIC I32 /* 0 failure, 1 success */
2829 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2831 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2835 register const bool do_utf8 = PL_reg_match_utf8;
2836 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2837 REGEXP *rex_sv = reginfo->prog;
2838 regexp *rex = (struct regexp *)SvANY(rex_sv);
2839 RXi_GET_DECL(rex,rexi);
2841 /* the current state. This is a cached copy of PL_regmatch_state */
2842 register regmatch_state *st;
2843 /* cache heavy used fields of st in registers */
2844 register regnode *scan;
2845 register regnode *next;
2846 register U32 n = 0; /* general value; init to avoid compiler warning */
2847 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2848 register char *locinput = PL_reginput;
2849 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2851 bool result = 0; /* return value of S_regmatch */
2852 int depth = 0; /* depth of backtrack stack */
2853 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2854 const U32 max_nochange_depth =
2855 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2856 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2857 regmatch_state *yes_state = NULL; /* state to pop to on success of
2859 /* mark_state piggy backs on the yes_state logic so that when we unwind
2860 the stack on success we can update the mark_state as we go */
2861 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2862 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2863 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2865 bool no_final = 0; /* prevent failure from backtracking? */
2866 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2867 char *startpoint = PL_reginput;
2868 SV *popmark = NULL; /* are we looking for a mark? */
2869 SV *sv_commit = NULL; /* last mark name seen in failure */
2870 SV *sv_yes_mark = NULL; /* last mark name we have seen
2871 during a successfull match */
2872 U32 lastopen = 0; /* last open we saw */
2873 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2874 SV* const oreplsv = GvSV(PL_replgv);
2875 /* these three flags are set by various ops to signal information to
2876 * the very next op. They have a useful lifetime of exactly one loop
2877 * iteration, and are not preserved or restored by state pushes/pops
2879 bool sw = 0; /* the condition value in (?(cond)a|b) */
2880 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2881 int logical = 0; /* the following EVAL is:
2885 or the following IFMATCH/UNLESSM is:
2886 false: plain (?=foo)
2887 true: used as a condition: (?(?=foo))
2890 GET_RE_DEBUG_FLAGS_DECL;
2893 PERL_ARGS_ASSERT_REGMATCH;
2895 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2896 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2898 /* on first ever call to regmatch, allocate first slab */
2899 if (!PL_regmatch_slab) {
2900 Newx(PL_regmatch_slab, 1, regmatch_slab);
2901 PL_regmatch_slab->prev = NULL;
2902 PL_regmatch_slab->next = NULL;
2903 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2906 oldsave = PL_savestack_ix;
2907 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2908 SAVEVPTR(PL_regmatch_slab);
2909 SAVEVPTR(PL_regmatch_state);
2911 /* grab next free state slot */
2912 st = ++PL_regmatch_state;
2913 if (st > SLAB_LAST(PL_regmatch_slab))
2914 st = PL_regmatch_state = S_push_slab(aTHX);
2916 /* Note that nextchr is a byte even in UTF */
2917 nextchr = UCHARAT(locinput);
2919 while (scan != NULL) {
2922 SV * const prop = sv_newmortal();
2923 regnode *rnext=regnext(scan);
2924 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2925 regprop(rex, prop, scan);
2927 PerlIO_printf(Perl_debug_log,
2928 "%3"IVdf":%*s%s(%"IVdf")\n",
2929 (IV)(scan - rexi->program), depth*2, "",
2931 (PL_regkind[OP(scan)] == END || !rnext) ?
2932 0 : (IV)(rnext - rexi->program));
2935 next = scan + NEXT_OFF(scan);
2938 state_num = OP(scan);
2940 REH_CALL_REGEXEC_HOOK(rex, scan, reginfo, st);
2943 assert(PL_reglastparen == &rex->lastparen);
2944 assert(PL_reglastcloseparen == &rex->lastcloseparen);
2945 assert(PL_regoffs == rex->offs);
2947 switch (state_num) {
2949 if (locinput == PL_bostr)
2951 /* reginfo->till = reginfo->bol; */
2956 if (locinput == PL_bostr ||
2957 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2963 if (locinput == PL_bostr)
2967 if (locinput == reginfo->ganch)
2972 /* update the startpoint */
2973 st->u.keeper.val = PL_regoffs[0].start;
2974 PL_reginput = locinput;
2975 PL_regoffs[0].start = locinput - PL_bostr;
2976 PUSH_STATE_GOTO(KEEPS_next, next);
2978 case KEEPS_next_fail:
2979 /* rollback the start point change */
2980 PL_regoffs[0].start = st->u.keeper.val;
2986 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2991 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2993 if (PL_regeol - locinput > 1)
2997 if (PL_regeol != locinput)
3001 if (!nextchr && locinput >= PL_regeol)
3004 locinput += PL_utf8skip[nextchr];
3005 if (locinput > PL_regeol)
3007 nextchr = UCHARAT(locinput);
3010 nextchr = UCHARAT(++locinput);
3013 if (!nextchr && locinput >= PL_regeol)
3015 nextchr = UCHARAT(++locinput);
3018 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3021 locinput += PL_utf8skip[nextchr];
3022 if (locinput > PL_regeol)
3024 nextchr = UCHARAT(locinput);
3027 nextchr = UCHARAT(++locinput);
3031 #define ST st->u.trie
3033 /* In this case the charclass data is available inline so
3034 we can fail fast without a lot of extra overhead.
3036 if (scan->flags == EXACT || !do_utf8) {
3037 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3039 PerlIO_printf(Perl_debug_log,
3040 "%*s %sfailed to match trie start class...%s\n",
3041 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3050 /* what type of TRIE am I? (utf8 makes this contextual) */
3051 DECL_TRIE_TYPE(scan);
3053 /* what trie are we using right now */
3054 reg_trie_data * const trie
3055 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3056 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3057 U32 state = trie->startstate;
3059 if (trie->bitmap && trie_type != trie_utf8_fold &&
3060 !TRIE_BITMAP_TEST(trie,*locinput)
3062 if (trie->states[ state ].wordnum) {
3064 PerlIO_printf(Perl_debug_log,
3065 "%*s %smatched empty string...%s\n",
3066 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3071 PerlIO_printf(Perl_debug_log,
3072 "%*s %sfailed to match trie start class...%s\n",
3073 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3080 U8 *uc = ( U8* )locinput;
3084 U8 *uscan = (U8*)NULL;
3086 SV *sv_accept_buff = NULL;
3087 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3089 ST.accepted = 0; /* how many accepting states we have seen */
3091 ST.jump = trie->jump;
3094 traverse the TRIE keeping track of all accepting states
3095 we transition through until we get to a failing node.
3098 while ( state && uc <= (U8*)PL_regeol ) {
3099 U32 base = trie->states[ state ].trans.base;
3102 /* We use charid to hold the wordnum as we don't use it
3103 for charid until after we have done the wordnum logic.
3104 We define an alias just so that the wordnum logic reads
3107 #define got_wordnum charid
3108 got_wordnum = trie->states[ state ].wordnum;
3110 if ( got_wordnum ) {
3111 if ( ! ST.accepted ) {
3113 SAVETMPS; /* XXX is this necessary? dmq */
3114 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3115 sv_accept_buff=newSV(bufflen *
3116 sizeof(reg_trie_accepted) - 1);
3117 SvCUR_set(sv_accept_buff, 0);
3118 SvPOK_on(sv_accept_buff);
3119 sv_2mortal(sv_accept_buff);
3122 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3125 if (ST.accepted >= bufflen) {
3127 ST.accept_buff =(reg_trie_accepted*)
3128 SvGROW(sv_accept_buff,
3129 bufflen * sizeof(reg_trie_accepted));
3131 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3132 + sizeof(reg_trie_accepted));
3135 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3136 ST.accept_buff[ST.accepted].endpos = uc;
3138 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3142 DEBUG_TRIE_EXECUTE_r({
3143 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
3144 PerlIO_printf( Perl_debug_log,
3145 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
3146 2+depth * 2, "", PL_colors[4],
3147 (UV)state, (UV)ST.accepted );
3151 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3152 uscan, len, uvc, charid, foldlen,
3156 (base + charid > trie->uniquecharcount )
3157 && (base + charid - 1 - trie->uniquecharcount
3159 && trie->trans[base + charid - 1 -
3160 trie->uniquecharcount].check == state)
3162 state = trie->trans[base + charid - 1 -
3163 trie->uniquecharcount ].next;
3174 DEBUG_TRIE_EXECUTE_r(
3175 PerlIO_printf( Perl_debug_log,
3176 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3177 charid, uvc, (UV)state, PL_colors[5] );
3184 PerlIO_printf( Perl_debug_log,
3185 "%*s %sgot %"IVdf" possible matches%s\n",
3186 REPORT_CODE_OFF + depth * 2, "",
3187 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3190 goto trie_first_try; /* jump into the fail handler */
3192 case TRIE_next_fail: /* we failed - try next alterative */
3194 REGCP_UNWIND(ST.cp);
3195 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3196 PL_regoffs[n].end = -1;
3197 *PL_reglastparen = n;
3206 ST.lastparen = *PL_reglastparen;
3209 if ( ST.accepted == 1 ) {
3210 /* only one choice left - just continue */
3212 AV *const trie_words
3213 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3214 SV ** const tmp = av_fetch( trie_words,
3215 ST.accept_buff[ 0 ].wordnum-1, 0 );
3216 SV *sv= tmp ? sv_newmortal() : NULL;
3218 PerlIO_printf( Perl_debug_log,
3219 "%*s %sonly one match left: #%d <%s>%s\n",
3220 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3221 ST.accept_buff[ 0 ].wordnum,
3222 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3223 PL_colors[0], PL_colors[1],
3224 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3226 : "not compiled under -Dr",
3229 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3230 /* in this case we free tmps/leave before we call regmatch
3231 as we wont be using accept_buff again. */
3233 locinput = PL_reginput;
3234 nextchr = UCHARAT(locinput);
3235 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3238 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3239 if (!has_cutgroup) {
3244 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3247 continue; /* execute rest of RE */
3250 if ( !ST.accepted-- ) {
3252 PerlIO_printf( Perl_debug_log,
3253 "%*s %sTRIE failed...%s\n",
3254 REPORT_CODE_OFF+depth*2, "",
3265 There are at least two accepting states left. Presumably
3266 the number of accepting states is going to be low,
3267 typically two. So we simply scan through to find the one
3268 with lowest wordnum. Once we find it, we swap the last
3269 state into its place and decrement the size. We then try to
3270 match the rest of the pattern at the point where the word
3271 ends. If we succeed, control just continues along the
3272 regex; if we fail we return here to try the next accepting
3279 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3280 DEBUG_TRIE_EXECUTE_r(
3281 PerlIO_printf( Perl_debug_log,
3282 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3283 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3284 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3285 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3288 if (ST.accept_buff[cur].wordnum <
3289 ST.accept_buff[best].wordnum)
3294 AV *const trie_words
3295 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3296 SV ** const tmp = av_fetch( trie_words,
3297 ST.accept_buff[ best ].wordnum - 1, 0 );
3298 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3300 ST.me + ST.jump[ST.accept_buff[best].wordnum];
3301 SV *sv= tmp ? sv_newmortal() : NULL;
3303 PerlIO_printf( Perl_debug_log,
3304 "%*s %strying alternation #%d <%s> at node #%d %s\n",
3305 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3306 ST.accept_buff[best].wordnum,
3307 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3308 PL_colors[0], PL_colors[1],
3309 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3310 ) : "not compiled under -Dr",
3311 REG_NODE_NUM(nextop),
3315 if ( best<ST.accepted ) {
3316 reg_trie_accepted tmp = ST.accept_buff[ best ];
3317 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3318 ST.accept_buff[ ST.accepted ] = tmp;
3321 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3322 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3325 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3327 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3332 /* we dont want to throw this away, see bug 57042*/
3333 if (oreplsv != GvSV(PL_replgv))
3334 sv_setsv(oreplsv, GvSV(PL_replgv));
3341 char *s = STRING(scan);
3343 if (do_utf8 != UTF) {
3344 /* The target and the pattern have differing utf8ness. */
3346 const char * const e = s + ln;
3349 /* The target is utf8, the pattern is not utf8. */
3354 if (NATIVE_TO_UNI(*(U8*)s) !=
3355 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3363 /* The target is not utf8, the pattern is utf8. */
3368 if (NATIVE_TO_UNI(*((U8*)l)) !=
3369 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3377 nextchr = UCHARAT(locinput);
3380 /* The target and the pattern have the same utf8ness. */
3381 /* Inline the first character, for speed. */
3382 if (UCHARAT(s) != nextchr)
3384 if (PL_regeol - locinput < ln)
3386 if (ln > 1 && memNE(s, locinput, ln))
3389 nextchr = UCHARAT(locinput);
3393 PL_reg_flags |= RF_tainted;
3396 char * const s = STRING(scan);
3399 if (do_utf8 || UTF) {
3400 /* Either target or the pattern are utf8. */
3401 const char * const l = locinput;
3402 char *e = PL_regeol;
3404 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3405 l, &e, 0, do_utf8)) {
3406 /* One more case for the sharp s:
3407 * pack("U0U*", 0xDF) =~ /ss/i,
3408 * the 0xC3 0x9F are the UTF-8
3409 * byte sequence for the U+00DF. */
3412 toLOWER(s[0]) == 's' &&
3414 toLOWER(s[1]) == 's' &&
3421 nextchr = UCHARAT(locinput);
3425 /* Neither the target and the pattern are utf8. */
3427 /* Inline the first character, for speed. */
3428 if (UCHARAT(s) != nextchr &&
3429 UCHARAT(s) != ((OP(scan) == EXACTF)
3430 ? PL_fold : PL_fold_locale)[nextchr])
3432 if (PL_regeol - locinput < ln)
3434 if (ln > 1 && (OP(scan) == EXACTF
3435 ? ibcmp(s, locinput, ln)
3436 : ibcmp_locale(s, locinput, ln)))
3439 nextchr = UCHARAT(locinput);
3444 PL_reg_flags |= RF_tainted;
3448 /* was last char in word? */
3450 if (locinput == PL_bostr)
3453 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3455 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3457 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3458 ln = isALNUM_uni(ln);
3459 LOAD_UTF8_CHARCLASS_ALNUM();
3460 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3463 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3464 n = isALNUM_LC_utf8((U8*)locinput);
3468 ln = (locinput != PL_bostr) ?
3469 UCHARAT(locinput - 1) : '\n';
3470 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3472 n = isALNUM(nextchr);
3475 ln = isALNUM_LC(ln);
3476 n = isALNUM_LC(nextchr);
3479 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3480 OP(scan) == BOUNDL))
3485 STRLEN inclasslen = PL_regeol - locinput;
3487 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3489 if (locinput >= PL_regeol)
3491 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3492 nextchr = UCHARAT(locinput);
3497 nextchr = UCHARAT(locinput);
3498 if (!REGINCLASS(rex, scan, (U8*)locinput))
3500 if (!nextchr && locinput >= PL_regeol)
3502 nextchr = UCHARAT(++locinput);
3506 /* If we might have the case of the German sharp s
3507 * in a casefolding Unicode character class. */
3509 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3510 locinput += SHARP_S_SKIP;
3511 nextchr = UCHARAT(locinput);
3516 /* Special char classes - The defines start on line 129 or so */
3517 CCC_TRY_AFF( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3518 CCC_TRY_NEG(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3520 CCC_TRY_AFF( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3521 CCC_TRY_NEG(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3523 CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3524 CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3527 if (locinput >= PL_regeol)
3530 LOAD_UTF8_CHARCLASS_MARK();
3531 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3533 locinput += PL_utf8skip[nextchr];
3534 while (locinput < PL_regeol &&
3535 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3536 locinput += UTF8SKIP(locinput);
3537 if (locinput > PL_regeol)
3542 nextchr = UCHARAT(locinput);
3549 PL_reg_flags |= RF_tainted;
3554 n = reg_check_named_buff_matched(rex,scan);
3557 type = REF + ( type - NREF );
3564 PL_reg_flags |= RF_tainted;
3568 n = ARG(scan); /* which paren pair */
3571 ln = PL_regoffs[n].start;
3572 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3573 if (*PL_reglastparen < n || ln == -1)
3574 sayNO; /* Do not match unless seen CLOSEn. */
3575 if (ln == PL_regoffs[n].end)
3579 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3581 const char *e = PL_bostr + PL_regoffs[n].end;
3583 * Note that we can't do the "other character" lookup trick as
3584 * in the 8-bit case (no pun intended) because in Unicode we
3585 * have to map both upper and title case to lower case.
3589 STRLEN ulen1, ulen2;
3590 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3591 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3595 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3596 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3597 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3604 nextchr = UCHARAT(locinput);
3608 /* Inline the first character, for speed. */
3609 if (UCHARAT(s) != nextchr &&
3611 (UCHARAT(s) != (type == REFF
3612 ? PL_fold : PL_fold_locale)[nextchr])))
3614 ln = PL_regoffs[n].end - ln;
3615 if (locinput + ln > PL_regeol)
3617 if (ln > 1 && (type == REF
3618 ? memNE(s, locinput, ln)
3620 ? ibcmp(s, locinput, ln)
3621 : ibcmp_locale(s, locinput, ln))))
3624 nextchr = UCHARAT(locinput);
3634 #define ST st->u.eval
3639 regexp_internal *rei;
3640 regnode *startpoint;
3643 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3644 if (cur_eval && cur_eval->locinput==locinput) {
3645 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3646 Perl_croak(aTHX_ "Infinite recursion in regex");
3647 if ( ++nochange_depth > max_nochange_depth )
3649 "Pattern subroutine nesting without pos change"
3650 " exceeded limit in regex");
3657 (void)ReREFCNT_inc(rex_sv);
3658 if (OP(scan)==GOSUB) {
3659 startpoint = scan + ARG2L(scan);
3660 ST.close_paren = ARG(scan);
3662 startpoint = rei->program+1;
3665 goto eval_recurse_doit;
3667 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3668 if (cur_eval && cur_eval->locinput==locinput) {
3669 if ( ++nochange_depth > max_nochange_depth )
3670 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3675 /* execute the code in the {...} */
3677 SV ** const before = SP;
3678 OP_4tree * const oop = PL_op;
3679 COP * const ocurcop = PL_curcop;
3681 char *saved_regeol = PL_regeol;
3684 PL_op = (OP_4tree*)rexi->data->data[n];
3685 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3686 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3687 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3688 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3691 SV *sv_mrk = get_sv("REGMARK", 1);
3692 sv_setsv(sv_mrk, sv_yes_mark);
3695 CALLRUNOPS(aTHX); /* Scalar context. */
3698 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3705 PAD_RESTORE_LOCAL(old_comppad);
3706 PL_curcop = ocurcop;
3707 PL_regeol = saved_regeol;
3710 sv_setsv(save_scalar(PL_replgv), ret);
3714 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3717 /* extract RE object from returned value; compiling if
3723 SV *const sv = SvRV(ret);
3725 if (SvTYPE(sv) == SVt_REGEXP) {
3727 } else if (SvSMAGICAL(sv)) {
3728 mg = mg_find(sv, PERL_MAGIC_qr);
3731 } else if (SvTYPE(ret) == SVt_REGEXP) {
3733 } else if (SvSMAGICAL(ret)) {
3734 if (SvGMAGICAL(ret)) {
3735 /* I don't believe that there is ever qr magic
3737 assert(!mg_find(ret, PERL_MAGIC_qr));
3738 sv_unmagic(ret, PERL_MAGIC_qr);
3741 mg = mg_find(ret, PERL_MAGIC_qr);
3742 /* testing suggests mg only ends up non-NULL for
3743 scalars who were upgraded and compiled in the
3744 else block below. In turn, this is only
3745 triggered in the "postponed utf8 string" tests
3751 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
3755 rx = reg_temp_copy(NULL, rx);
3759 const I32 osize = PL_regsize;
3762 assert (SvUTF8(ret));
3763 } else if (SvUTF8(ret)) {
3764 /* Not doing UTF-8, despite what the SV says. Is
3765 this only if we're trapped in use 'bytes'? */
3766 /* Make a copy of the octet sequence, but without
3767 the flag on, as the compiler now honours the
3768 SvUTF8 flag on ret. */
3770 const char *const p = SvPV(ret, len);
3771 ret = newSVpvn_flags(p, len, SVs_TEMP);
3773 rx = CALLREGCOMP(ret, pm_flags);
3775 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3777 /* This isn't a first class regexp. Instead, it's
3778 caching a regexp onto an existing, Perl visible
3780 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3785 re = (struct regexp *)SvANY(rx);
3787 RXp_MATCH_COPIED_off(re);
3788 re->subbeg = rex->subbeg;
3789 re->sublen = rex->sublen;
3792 debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
3793 "Matching embedded");
3795 startpoint = rei->program + 1;
3796 ST.close_paren = 0; /* only used for GOSUB */
3797 /* borrowed from regtry */
3798 if (PL_reg_start_tmpl <= re->nparens) {
3799 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3800 if(PL_reg_start_tmp)
3801 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3803 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3806 eval_recurse_doit: /* Share code with GOSUB below this line */
3807 /* run the pattern returned from (??{...}) */
3808 ST.cp = regcppush(0); /* Save *all* the positions. */
3809 REGCP_SET(ST.lastcp);
3811 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
3813 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
3814 PL_reglastparen = &re->lastparen;
3815 PL_reglastcloseparen = &re->lastcloseparen;
3817 re->lastcloseparen = 0;
3819 PL_reginput = locinput;
3822 /* XXXX This is too dramatic a measure... */
3825 ST.toggle_reg_flags = PL_reg_flags;
3827 PL_reg_flags |= RF_utf8;
3829 PL_reg_flags &= ~RF_utf8;
3830 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3832 ST.prev_rex = rex_sv;
3833 ST.prev_curlyx = cur_curlyx;
3834 SETREX(rex_sv,re_sv);
3839 ST.prev_eval = cur_eval;
3841 /* now continue from first node in postoned RE */
3842 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3845 /* logical is 1, /(?(?{...})X|Y)/ */
3846 sw = (bool)SvTRUE(ret);
3851 case EVAL_AB: /* cleanup after a successful (??{A})B */
3852 /* note: this is called twice; first after popping B, then A */
3853 PL_reg_flags ^= ST.toggle_reg_flags;
3854 ReREFCNT_dec(rex_sv);
3855 SETREX(rex_sv,ST.prev_rex);
3856 rex = (struct regexp *)SvANY(rex_sv);
3857 rexi = RXi_GET(rex);
3859 cur_eval = ST.prev_eval;
3860 cur_curlyx = ST.prev_curlyx;
3862 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
3863 PL_reglastparen = &rex->lastparen;
3864 PL_reglastcloseparen = &rex->lastcloseparen;
3865 /* also update PL_regoffs */
3866 PL_regoffs = rex->offs;
3868 /* XXXX This is too dramatic a measure... */
3870 if ( nochange_depth )
3875 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3876 /* note: this is called twice; first after popping B, then A */
3877 PL_reg_flags ^= ST.toggle_reg_flags;
3878 ReREFCNT_dec(rex_sv);
3879 SETREX(rex_sv,ST.prev_rex);
3880 rex = (struct regexp *)SvANY(rex_sv);
3881 rexi = RXi_GET(rex);
3882 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
3883 PL_reglastparen = &rex->lastparen;
3884 PL_reglastcloseparen = &rex->lastcloseparen;
3886 PL_reginput = locinput;
3887 REGCP_UNWIND(ST.lastcp);
3889 cur_eval = ST.prev_eval;
3890 cur_curlyx = ST.prev_curlyx;
3891 /* XXXX This is too dramatic a measure... */
3893 if ( nochange_depth )
3899 n = ARG(scan); /* which paren pair */
3900 PL_reg_start_tmp[n] = locinput;
3906 n = ARG(scan); /* which paren pair */
3907 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
3908 PL_regoffs[n].end = locinput - PL_bostr;
3909 /*if (n > PL_regsize)
3911 if (n > *PL_reglastparen)
3912 *PL_reglastparen = n;
3913 *PL_reglastcloseparen = n;
3914 if (cur_eval && cur_eval->u.eval.close_paren == n) {
3922 cursor && OP(cursor)!=END;
3923 cursor=regnext(cursor))
3925 if ( OP(cursor)==CLOSE ){
3927 if ( n <= lastopen ) {
3929 = PL_reg_start_tmp[n] - PL_bostr;
3930 PL_regoffs[n].end = locinput - PL_bostr;
3931 /*if (n > PL_regsize)
3933 if (n > *PL_reglastparen)
3934 *PL_reglastparen = n;
3935 *PL_reglastcloseparen = n;
3936 if ( n == ARG(scan) || (cur_eval &&
3937 cur_eval->u.eval.close_paren == n))
3946 n = ARG(scan); /* which paren pair */
3947 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
3950 /* reg_check_named_buff_matched returns 0 for no match */
3951 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3955 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3961 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3963 next = NEXTOPER(NEXTOPER(scan));
3965 next = scan + ARG(scan);
3966 if (OP(next) == IFTHEN) /* Fake one. */
3967 next = NEXTOPER(NEXTOPER(next));
3971 logical = scan->flags;
3974 /*******************************************************************
3976 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3977 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3978 STAR/PLUS/CURLY/CURLYN are used instead.)
3980 A*B is compiled as <CURLYX><A><WHILEM><B>
3982 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3983 state, which contains the current count, initialised to -1. It also sets
3984 cur_curlyx to point to this state, with any previous value saved in the
3987 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3988 since the pattern may possibly match zero times (i.e. it's a while {} loop
3989 rather than a do {} while loop).
3991 Each entry to WHILEM represents a successful match of A. The count in the
3992 CURLYX block is incremented, another WHILEM state is pushed, and execution
3993 passes to A or B depending on greediness and the current count.
3995 For example, if matching against the string a1a2a3b (where the aN are
3996 substrings that match /A/), then the match progresses as follows: (the
3997 pushed states are interspersed with the bits of strings matched so far):
4000 <CURLYX cnt=0><WHILEM>
4001 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4002 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4003 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4004 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4006 (Contrast this with something like CURLYM, which maintains only a single
4010 a1 <CURLYM cnt=1> a2
4011 a1 a2 <CURLYM cnt=2> a3
4012 a1 a2 a3 <CURLYM cnt=3> b
4015 Each WHILEM state block marks a point to backtrack to upon partial failure
4016 of A or B, and also contains some minor state data related to that
4017 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4018 overall state, such as the count, and pointers to the A and B ops.
4020 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4021 must always point to the *current* CURLYX block, the rules are:
4023 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4024 and set cur_curlyx to point the new block.
4026 When popping the CURLYX block after a successful or unsuccessful match,
4027 restore the previous cur_curlyx.
4029 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4030 to the outer one saved in the CURLYX block.
4032 When popping the WHILEM block after a successful or unsuccessful B match,
4033 restore the previous cur_curlyx.
4035 Here's an example for the pattern (AI* BI)*BO
4036 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4039 curlyx backtrack stack
4040 ------ ---------------
4042 CO <CO prev=NULL> <WO>
4043 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4044 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4045 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4047 At this point the pattern succeeds, and we work back down the stack to
4048 clean up, restoring as we go:
4050 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4051 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4052 CO <CO prev=NULL> <WO>
4055 *******************************************************************/
4057 #define ST st->u.curlyx
4059 case CURLYX: /* start of /A*B/ (for complex A) */
4061 /* No need to save/restore up to this paren */
4062 I32 parenfloor = scan->flags;
4064 assert(next); /* keep Coverity happy */
4065 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4068 /* XXXX Probably it is better to teach regpush to support
4069 parenfloor > PL_regsize... */
4070 if (parenfloor > (I32)*PL_reglastparen)
4071 parenfloor = *PL_reglastparen; /* Pessimization... */
4073 ST.prev_curlyx= cur_curlyx;
4075 ST.cp = PL_savestack_ix;
4077 /* these fields contain the state of the current curly.
4078 * they are accessed by subsequent WHILEMs */
4079 ST.parenfloor = parenfloor;
4080 ST.min = ARG1(scan);
4081 ST.max = ARG2(scan);
4082 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4086 ST.count = -1; /* this will be updated by WHILEM */
4087 ST.lastloc = NULL; /* this will be updated by WHILEM */
4089 PL_reginput = locinput;
4090 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4094 case CURLYX_end: /* just finished matching all of A*B */
4095 cur_curlyx = ST.prev_curlyx;
4099 case CURLYX_end_fail: /* just failed to match all of A*B */
4101 cur_curlyx = ST.prev_curlyx;
4107 #define ST st->u.whilem
4109 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4111 /* see the discussion above about CURLYX/WHILEM */
4113 assert(cur_curlyx); /* keep Coverity happy */
4114 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4115 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4116 ST.cache_offset = 0;
4119 PL_reginput = locinput;
4121 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4122 "%*s whilem: matched %ld out of %ld..%ld\n",
4123 REPORT_CODE_OFF+depth*2, "", (long)n,
4124 (long)cur_curlyx->u.curlyx.min,
4125 (long)cur_curlyx->u.curlyx.max)
4128 /* First just match a string of min A's. */
4130 if (n < cur_curlyx->u.curlyx.min) {
4131 cur_curlyx->u.curlyx.lastloc = locinput;
4132 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4136 /* If degenerate A matches "", assume A done. */
4138 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4139 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4140 "%*s whilem: empty match detected, trying continuation...\n",
4141 REPORT_CODE_OFF+depth*2, "")
4143 goto do_whilem_B_max;
4146 /* super-linear cache processing */
4150 if (!PL_reg_maxiter) {
4151 /* start the countdown: Postpone detection until we
4152 * know the match is not *that* much linear. */
4153 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4154 /* possible overflow for long strings and many CURLYX's */
4155 if (PL_reg_maxiter < 0)
4156 PL_reg_maxiter = I32_MAX;
4157 PL_reg_leftiter = PL_reg_maxiter;
4160 if (PL_reg_leftiter-- == 0) {
4161 /* initialise cache */
4162 const I32 size = (PL_reg_maxiter + 7)/8;
4163 if (PL_reg_poscache) {
4164 if ((I32)PL_reg_poscache_size < size) {
4165 Renew(PL_reg_poscache, size, char);
4166 PL_reg_poscache_size = size;
4168 Zero(PL_reg_poscache, size, char);
4171 PL_reg_poscache_size = size;
4172 Newxz(PL_reg_poscache, size, char);
4174 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4175 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4176 PL_colors[4], PL_colors[5])
4180 if (PL_reg_leftiter < 0) {
4181 /* have we already failed at this position? */
4183 offset = (scan->flags & 0xf) - 1
4184 + (locinput - PL_bostr) * (scan->flags>>4);
4185 mask = 1 << (offset % 8);
4187 if (PL_reg_poscache[offset] & mask) {
4188 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4189 "%*s whilem: (cache) already tried at this position...\n",
4190 REPORT_CODE_OFF+depth*2, "")
4192 sayNO; /* cache records failure */
4194 ST.cache_offset = offset;
4195 ST.cache_mask = mask;
4199 /* Prefer B over A for minimal matching. */
4201 if (cur_curlyx->u.curlyx.minmod) {
4202 ST.save_curlyx = cur_curlyx;
4203 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4204 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4205 REGCP_SET(ST.lastcp);
4206 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4210 /* Prefer A over B for maximal matching. */
4212 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4213 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4214 cur_curlyx->u.curlyx.lastloc = locinput;
4215 REGCP_SET(ST.lastcp);
4216 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4219 goto do_whilem_B_max;
4223 case WHILEM_B_min: /* just matched B in a minimal match */
4224 case WHILEM_B_max: /* just matched B in a maximal match */
4225 cur_curlyx = ST.save_curlyx;
4229 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4230 cur_curlyx = ST.save_curlyx;
4231 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4232 cur_curlyx->u.curlyx.count--;
4236 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4237 REGCP_UNWIND(ST.lastcp);
4240 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4241 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4242 cur_curlyx->u.curlyx.count--;
4246 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4247 REGCP_UNWIND(ST.lastcp);
4248 regcppop(rex); /* Restore some previous $<digit>s? */
4249 PL_reginput = locinput;
4250 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4251 "%*s whilem: failed, trying continuation...\n",
4252 REPORT_CODE_OFF+depth*2, "")
4255 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4256 && ckWARN(WARN_REGEXP)
4257 && !(PL_reg_flags & RF_warned))
4259 PL_reg_flags |= RF_warned;
4260 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4261 "Complex regular subexpression recursion",
4266 ST.save_curlyx = cur_curlyx;
4267 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4268 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4271 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4272 cur_curlyx = ST.save_curlyx;
4273 REGCP_UNWIND(ST.lastcp);
4276 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4277 /* Maximum greed exceeded */
4278 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4279 && ckWARN(WARN_REGEXP)
4280 && !(PL_reg_flags & RF_warned))
4282 PL_reg_flags |= RF_warned;
4283 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4284 "%s limit (%d) exceeded",
4285 "Complex regular subexpression recursion",
4288 cur_curlyx->u.curlyx.count--;
4292 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4293 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4295 /* Try grabbing another A and see if it helps. */
4296 PL_reginput = locinput;
4297 cur_curlyx->u.curlyx.lastloc = locinput;
4298 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4299 REGCP_SET(ST.lastcp);
4300 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4304 #define ST st->u.branch
4306 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4307 next = scan + ARG(scan);
4310 scan = NEXTOPER(scan);
4313 case BRANCH: /* /(...|A|...)/ */
4314 scan = NEXTOPER(scan); /* scan now points to inner node */
4315 ST.lastparen = *PL_reglastparen;
4316 ST.next_branch = next;
4318 PL_reginput = locinput;
4320 /* Now go into the branch */
4322 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4324 PUSH_STATE_GOTO(BRANCH_next, scan);
4328 PL_reginput = locinput;
4329 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4330 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4331 PUSH_STATE_GOTO(CUTGROUP_next,next);
4333 case CUTGROUP_next_fail:
4336 if (st->u.mark.mark_name)
4337 sv_commit = st->u.mark.mark_name;
4343 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4348 REGCP_UNWIND(ST.cp);
4349 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4350 PL_regoffs[n].end = -1;
4351 *PL_reglastparen = n;
4352 /*dmq: *PL_reglastcloseparen = n; */
4353 scan = ST.next_branch;
4354 /* no more branches? */
4355 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4357 PerlIO_printf( Perl_debug_log,
4358 "%*s %sBRANCH failed...%s\n",
4359 REPORT_CODE_OFF+depth*2, "",
4365 continue; /* execute next BRANCH[J] op */
4373 #define ST st->u.curlym
4375 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4377 /* This is an optimisation of CURLYX that enables us to push
4378 * only a single backtracking state, no matter how many matches
4379 * there are in {m,n}. It relies on the pattern being constant
4380 * length, with no parens to influence future backrefs
4384 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4386 /* if paren positive, emulate an OPEN/CLOSE around A */
4388 U32 paren = ST.me->flags;
4389 if (paren > PL_regsize)
4391 if (paren > *PL_reglastparen)
4392 *PL_reglastparen = paren;
4393 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4401 ST.c1 = CHRTEST_UNINIT;
4404 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4407 curlym_do_A: /* execute the A in /A{m,n}B/ */
4408 PL_reginput = locinput;
4409 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4412 case CURLYM_A: /* we've just matched an A */
4413 locinput = st->locinput;
4414 nextchr = UCHARAT(locinput);
4417 /* after first match, determine A's length: u.curlym.alen */
4418 if (ST.count == 1) {
4419 if (PL_reg_match_utf8) {
4421 while (s < PL_reginput) {
4427 ST.alen = PL_reginput - locinput;
4430 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4433 PerlIO_printf(Perl_debug_log,
4434 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4435 (int)(REPORT_CODE_OFF+(depth*2)), "",
4436 (IV) ST.count, (IV)ST.alen)
4439 locinput = PL_reginput;
4441 if (cur_eval && cur_eval->u.eval.close_paren &&
4442 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4446 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4447 if ( max == REG_INFTY || ST.count < max )
4448 goto curlym_do_A; /* try to match another A */
4450 goto curlym_do_B; /* try to match B */
4452 case CURLYM_A_fail: /* just failed to match an A */
4453 REGCP_UNWIND(ST.cp);
4455 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4456 || (cur_eval && cur_eval->u.eval.close_paren &&
4457 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4460 curlym_do_B: /* execute the B in /A{m,n}B/ */
4461 PL_reginput = locinput;
4462 if (ST.c1 == CHRTEST_UNINIT) {
4463 /* calculate c1 and c2 for possible match of 1st char
4464 * following curly */
4465 ST.c1 = ST.c2 = CHRTEST_VOID;
4466 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4467 regnode *text_node = ST.B;
4468 if (! HAS_TEXT(text_node))
4469 FIND_NEXT_IMPT(text_node);
4472 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4474 But the former is redundant in light of the latter.
4476 if this changes back then the macro for
4477 IS_TEXT and friends need to change.
4479 if (PL_regkind[OP(text_node)] == EXACT)
4482 ST.c1 = (U8)*STRING(text_node);
4484 (IS_TEXTF(text_node))
4486 : (IS_TEXTFL(text_node))
4487 ? PL_fold_locale[ST.c1]
4494 PerlIO_printf(Perl_debug_log,
4495 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4496 (int)(REPORT_CODE_OFF+(depth*2)),
4499 if (ST.c1 != CHRTEST_VOID
4500 && UCHARAT(PL_reginput) != ST.c1
4501 && UCHARAT(PL_reginput) != ST.c2)
4503 /* simulate B failing */
4505 PerlIO_printf(Perl_debug_log,
4506 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4507 (int)(REPORT_CODE_OFF+(depth*2)),"",
4510 state_num = CURLYM_B_fail;
4511 goto reenter_switch;
4515 /* mark current A as captured */
4516 I32 paren = ST.me->flags;
4518 PL_regoffs[paren].start
4519 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4520 PL_regoffs[paren].end = PL_reginput - PL_bostr;
4521 /*dmq: *PL_reglastcloseparen = paren; */
4524 PL_regoffs[paren].end = -1;
4525 if (cur_eval && cur_eval->u.eval.close_paren &&
4526 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4535 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4538 case CURLYM_B_fail: /* just failed to match a B */
4539 REGCP_UNWIND(ST.cp);
4541 I32 max = ARG2(ST.me);
4542 if (max != REG_INFTY && ST.count == max)
4544 goto curlym_do_A; /* try to match a further A */
4546 /* backtrack one A */
4547 if (ST.count == ARG1(ST.me) /* min */)
4550 locinput = HOPc(locinput, -ST.alen);
4551 goto curlym_do_B; /* try to match B */
4554 #define ST st->u.curly
4556 #define CURLY_SETPAREN(paren, success) \
4559 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4560 PL_regoffs[paren].end = locinput - PL_bostr; \
4561 *PL_reglastcloseparen = paren; \
4564 PL_regoffs[paren].end = -1; \
4567 case STAR: /* /A*B/ where A is width 1 */
4571 scan = NEXTOPER(scan);
4573 case PLUS: /* /A+B/ where A is width 1 */
4577 scan = NEXTOPER(scan);
4579 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4580 ST.paren = scan->flags; /* Which paren to set */
4581 if (ST.paren > PL_regsize)
4582 PL_regsize = ST.paren;
4583 if (ST.paren > *PL_reglastparen)
4584 *PL_reglastparen = ST.paren;
4585 ST.min = ARG1(scan); /* min to match */
4586 ST.max = ARG2(scan); /* max to match */
4587 if (cur_eval && cur_eval->u.eval.close_paren &&
4588 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4592 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4594 case CURLY: /* /A{m,n}B/ where A is width 1 */
4596 ST.min = ARG1(scan); /* min to match */
4597 ST.max = ARG2(scan); /* max to match */
4598 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4601 * Lookahead to avoid useless match attempts
4602 * when we know what character comes next.
4604 * Used to only do .*x and .*?x, but now it allows
4605 * for )'s, ('s and (?{ ... })'s to be in the way
4606 * of the quantifier and the EXACT-like node. -- japhy
4609 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4611 if (HAS_TEXT(next) || JUMPABLE(next)) {
4613 regnode *text_node = next;
4615 if (! HAS_TEXT(text_node))
4616 FIND_NEXT_IMPT(text_node);
4618 if (! HAS_TEXT(text_node))
4619 ST.c1 = ST.c2 = CHRTEST_VOID;
4621 if ( PL_regkind[OP(text_node)] != EXACT ) {
4622 ST.c1 = ST.c2 = CHRTEST_VOID;
4623 goto assume_ok_easy;
4626 s = (U8*)STRING(text_node);
4628 /* Currently we only get here when
4630 PL_rekind[OP(text_node)] == EXACT
4632 if this changes back then the macro for IS_TEXT and
4633 friends need to change. */
4636 if (IS_TEXTF(text_node))
4637 ST.c2 = PL_fold[ST.c1];
4638 else if (IS_TEXTFL(text_node))
4639 ST.c2 = PL_fold_locale[ST.c1];
4642 if (IS_TEXTF(text_node)) {
4643 STRLEN ulen1, ulen2;
4644 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4645 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4647 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4648 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4650 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4652 0 : UTF8_ALLOW_ANY);
4653 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4655 0 : UTF8_ALLOW_ANY);
4657 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4659 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4664 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4671 ST.c1 = ST.c2 = CHRTEST_VOID;
4676 PL_reginput = locinput;
4679 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4682 locinput = PL_reginput;
4684 if (ST.c1 == CHRTEST_VOID)
4685 goto curly_try_B_min;
4687 ST.oldloc = locinput;
4689 /* set ST.maxpos to the furthest point along the
4690 * string that could possibly match */
4691 if (ST.max == REG_INFTY) {
4692 ST.maxpos = PL_regeol - 1;
4694 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4698 int m = ST.max - ST.min;
4699 for (ST.maxpos = locinput;
4700 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4701 ST.maxpos += UTF8SKIP(ST.maxpos);
4704 ST.maxpos = locinput + ST.max - ST.min;
4705 if (ST.maxpos >= PL_regeol)
4706 ST.maxpos = PL_regeol - 1;
4708 goto curly_try_B_min_known;
4712 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4713 locinput = PL_reginput;
4714 if (ST.count < ST.min)
4716 if ((ST.count > ST.min)
4717 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4719 /* A{m,n} must come at the end of the string, there's
4720 * no point in backing off ... */
4722 /* ...except that $ and \Z can match before *and* after
4723 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4724 We may back off by one in this case. */
4725 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4729 goto curly_try_B_max;
4734 case CURLY_B_min_known_fail:
4735 /* failed to find B in a non-greedy match where c1,c2 valid */
4736 if (ST.paren && ST.count)
4737 PL_regoffs[ST.paren].end = -1;
4739 PL_reginput = locinput; /* Could be reset... */
4740 REGCP_UNWIND(ST.cp);
4741 /* Couldn't or didn't -- move forward. */
4742 ST.oldloc = locinput;
4744 locinput += UTF8SKIP(locinput);
4748 curly_try_B_min_known:
4749 /* find the next place where 'B' could work, then call B */
4753 n = (ST.oldloc == locinput) ? 0 : 1;
4754 if (ST.c1 == ST.c2) {
4756 /* set n to utf8_distance(oldloc, locinput) */
4757 while (locinput <= ST.maxpos &&
4758 utf8n_to_uvchr((U8*)locinput,
4759 UTF8_MAXBYTES, &len,
4760 uniflags) != (UV)ST.c1) {
4766 /* set n to utf8_distance(oldloc, locinput) */
4767 while (locinput <= ST.maxpos) {
4769 const UV c = utf8n_to_uvchr((U8*)locinput,
4770 UTF8_MAXBYTES, &len,
4772 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4780 if (ST.c1 == ST.c2) {
4781 while (locinput <= ST.maxpos &&
4782 UCHARAT(locinput) != ST.c1)
4786 while (locinput <= ST.maxpos
4787 && UCHARAT(locinput) != ST.c1
4788 && UCHARAT(locinput) != ST.c2)
4791 n = locinput - ST.oldloc;
4793 if (locinput > ST.maxpos)
4795 /* PL_reginput == oldloc now */
4798 if (regrepeat(rex, ST.A, n, depth) < n)
4801 PL_reginput = locinput;
4802 CURLY_SETPAREN(ST.paren, ST.count);
4803 if (cur_eval && cur_eval->u.eval.close_paren &&
4804 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4807 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4812 case CURLY_B_min_fail:
4813 /* failed to find B in a non-greedy match where c1,c2 invalid */
4814 if (ST.paren && ST.count)
4815 PL_regoffs[ST.paren].end = -1;
4817 REGCP_UNWIND(ST.cp);
4818 /* failed -- move forward one */
4819 PL_reginput = locinput;
4820 if (regrepeat(rex, ST.A, 1, depth)) {
4822 locinput = PL_reginput;
4823 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4824 ST.count > 0)) /* count overflow ? */
4827 CURLY_SETPAREN(ST.paren, ST.count);
4828 if (cur_eval && cur_eval->u.eval.close_paren &&
4829 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4832 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4840 /* a successful greedy match: now try to match B */
4841 if (cur_eval && cur_eval->u.eval.close_paren &&
4842 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4847 if (ST.c1 != CHRTEST_VOID)
4848 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4849 UTF8_MAXBYTES, 0, uniflags)
4850 : (UV) UCHARAT(PL_reginput);
4851 /* If it could work, try it. */
4852 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4853 CURLY_SETPAREN(ST.paren, ST.count);
4854 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4859 case CURLY_B_max_fail:
4860 /* failed to find B in a greedy match */
4861 if (ST.paren && ST.count)
4862 PL_regoffs[ST.paren].end = -1;
4864 REGCP_UNWIND(ST.cp);
4866 if (--ST.count < ST.min)
4868 PL_reginput = locinput = HOPc(locinput, -1);
4869 goto curly_try_B_max;
4876 /* we've just finished A in /(??{A})B/; now continue with B */
4878 st->u.eval.toggle_reg_flags
4879 = cur_eval->u.eval.toggle_reg_flags;
4880 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4882 st->u.eval.prev_rex = rex_sv; /* inner */
4883 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
4884 rex = (struct regexp *)SvANY(rex_sv);
4885 rexi = RXi_GET(rex);
4886 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4887 ReREFCNT_inc(rex_sv);
4888 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4890 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4891 PL_reglastparen = &rex->lastparen;
4892 PL_reglastcloseparen = &rex->lastcloseparen;
4894 REGCP_SET(st->u.eval.lastcp);
4895 PL_reginput = locinput;
4897 /* Restore parens of the outer rex without popping the
4899 tmpix = PL_savestack_ix;
4900 PL_savestack_ix = cur_eval->u.eval.lastcp;
4902 PL_savestack_ix = tmpix;
4904 st->u.eval.prev_eval = cur_eval;
4905 cur_eval = cur_eval->u.eval.prev_eval;
4907 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4908 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
4909 if ( nochange_depth )
4912 PUSH_YES_STATE_GOTO(EVAL_AB,
4913 st->u.eval.prev_eval->u.eval.B); /* match B */
4916 if (locinput < reginfo->till) {
4917 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4918 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4920 (long)(locinput - PL_reg_starttry),
4921 (long)(reginfo->till - PL_reg_starttry),
4924 sayNO_SILENT; /* Cannot match: too short. */
4926 PL_reginput = locinput; /* put where regtry can find it */
4927 sayYES; /* Success! */
4929 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4931 PerlIO_printf(Perl_debug_log,
4932 "%*s %ssubpattern success...%s\n",
4933 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4934 PL_reginput = locinput; /* put where regtry can find it */
4935 sayYES; /* Success! */
4938 #define ST st->u.ifmatch
4940 case SUSPEND: /* (?>A) */
4942 PL_reginput = locinput;
4945 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4947 goto ifmatch_trivial_fail_test;
4949 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4951 ifmatch_trivial_fail_test:
4953 char * const s = HOPBACKc(locinput, scan->flags);
4958 sw = 1 - (bool)ST.wanted;
4962 next = scan + ARG(scan);
4970 PL_reginput = locinput;
4974 ST.logical = logical;
4975 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
4977 /* execute body of (?...A) */
4978 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4981 case IFMATCH_A_fail: /* body of (?...A) failed */
4982 ST.wanted = !ST.wanted;
4985 case IFMATCH_A: /* body of (?...A) succeeded */
4987 sw = (bool)ST.wanted;
4989 else if (!ST.wanted)
4992 if (OP(ST.me) == SUSPEND)
4993 locinput = PL_reginput;
4995 locinput = PL_reginput = st->locinput;
4996 nextchr = UCHARAT(locinput);
4998 scan = ST.me + ARG(ST.me);
5001 continue; /* execute B */
5006 next = scan + ARG(scan);
5011 reginfo->cutpoint = PL_regeol;
5014 PL_reginput = locinput;
5016 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5017 PUSH_STATE_GOTO(COMMIT_next,next);
5019 case COMMIT_next_fail:
5026 #define ST st->u.mark
5028 ST.prev_mark = mark_state;
5029 ST.mark_name = sv_commit = sv_yes_mark
5030 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5032 ST.mark_loc = PL_reginput = locinput;
5033 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5035 case MARKPOINT_next:
5036 mark_state = ST.prev_mark;
5039 case MARKPOINT_next_fail:
5040 if (popmark && sv_eq(ST.mark_name,popmark))
5042 if (ST.mark_loc > startpoint)
5043 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5044 popmark = NULL; /* we found our mark */
5045 sv_commit = ST.mark_name;
5048 PerlIO_printf(Perl_debug_log,
5049 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5050 REPORT_CODE_OFF+depth*2, "",
5051 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5054 mark_state = ST.prev_mark;
5055 sv_yes_mark = mark_state ?
5056 mark_state->u.mark.mark_name : NULL;
5060 PL_reginput = locinput;
5062 /* (*SKIP) : if we fail we cut here*/
5063 ST.mark_name = NULL;
5064 ST.mark_loc = locinput;
5065 PUSH_STATE_GOTO(SKIP_next,next);
5067 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5068 otherwise do nothing. Meaning we need to scan
5070 regmatch_state *cur = mark_state;
5071 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5074 if ( sv_eq( cur->u.mark.mark_name,
5077 ST.mark_name = find;
5078 PUSH_STATE_GOTO( SKIP_next, next );
5080 cur = cur->u.mark.prev_mark;
5083 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5085 case SKIP_next_fail:
5087 /* (*CUT:NAME) - Set up to search for the name as we
5088 collapse the stack*/
5089 popmark = ST.mark_name;
5091 /* (*CUT) - No name, we cut here.*/
5092 if (ST.mark_loc > startpoint)
5093 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5094 /* but we set sv_commit to latest mark_name if there
5095 is one so they can test to see how things lead to this
5098 sv_commit=mark_state->u.mark.mark_name;
5106 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
5108 } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5111 U8 folded[UTF8_MAXBYTES_CASE+1];
5113 const char * const l = locinput;
5114 char *e = PL_regeol;
5115 to_uni_fold(n, folded, &foldlen);
5117 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
5118 l, &e, 0, do_utf8)) {
5123 nextchr = UCHARAT(locinput);
5126 if ((n=is_LNBREAK(locinput,do_utf8))) {
5128 nextchr = UCHARAT(locinput);
5133 #define CASE_CLASS(nAmE) \
5135 if ((n=is_##nAmE(locinput,do_utf8))) { \
5137 nextchr = UCHARAT(locinput); \
5142 if ((n=is_##nAmE(locinput,do_utf8))) { \
5145 locinput += UTF8SKIP(locinput); \
5146 nextchr = UCHARAT(locinput); \
5151 CASE_CLASS(HORIZWS);
5155 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5156 PTR2UV(scan), OP(scan));
5157 Perl_croak(aTHX_ "regexp memory corruption");
5161 /* switch break jumps here */
5162 scan = next; /* prepare to execute the next op and ... */
5163 continue; /* ... jump back to the top, reusing st */
5167 /* push a state that backtracks on success */
5168 st->u.yes.prev_yes_state = yes_state;
5172 /* push a new regex state, then continue at scan */
5174 regmatch_state *newst;
5177 regmatch_state *cur = st;
5178 regmatch_state *curyes = yes_state;
5180 regmatch_slab *slab = PL_regmatch_slab;
5181 for (;curd > -1;cur--,curd--) {
5182 if (cur < SLAB_FIRST(slab)) {
5184 cur = SLAB_LAST(slab);
5186 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5187 REPORT_CODE_OFF + 2 + depth * 2,"",
5188 curd, PL_reg_name[cur->resume_state],
5189 (curyes == cur) ? "yes" : ""
5192 curyes = cur->u.yes.prev_yes_state;
5195 DEBUG_STATE_pp("push")
5198 st->locinput = locinput;
5200 if (newst > SLAB_LAST(PL_regmatch_slab))
5201 newst = S_push_slab(aTHX);
5202 PL_regmatch_state = newst;
5204 locinput = PL_reginput;
5205 nextchr = UCHARAT(locinput);
5213 * We get here only if there's trouble -- normally "case END" is
5214 * the terminating point.
5216 Perl_croak(aTHX_ "corrupted regexp pointers");
5222 /* we have successfully completed a subexpression, but we must now
5223 * pop to the state marked by yes_state and continue from there */
5224 assert(st != yes_state);
5226 while (st != yes_state) {
5228 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5229 PL_regmatch_slab = PL_regmatch_slab->prev;
5230 st = SLAB_LAST(PL_regmatch_slab);
5234 DEBUG_STATE_pp("pop (no final)");
5236 DEBUG_STATE_pp("pop (yes)");
5242 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5243 || yes_state > SLAB_LAST(PL_regmatch_slab))
5245 /* not in this slab, pop slab */
5246 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5247 PL_regmatch_slab = PL_regmatch_slab->prev;
5248 st = SLAB_LAST(PL_regmatch_slab);
5250 depth -= (st - yes_state);
5253 yes_state = st->u.yes.prev_yes_state;
5254 PL_regmatch_state = st;
5257 locinput= st->locinput;
5258 nextchr = UCHARAT(locinput);
5260 state_num = st->resume_state + no_final;
5261 goto reenter_switch;
5264 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5265 PL_colors[4], PL_colors[5]));
5267 if (PL_reg_eval_set) {
5268 /* each successfully executed (?{...}) block does the equivalent of
5269 * local $^R = do {...}
5270 * When popping the save stack, all these locals would be undone;
5271 * bypass this by setting the outermost saved $^R to the latest
5273 if (oreplsv != GvSV(PL_replgv))
5274 sv_setsv(oreplsv, GvSV(PL_replgv));
5281 PerlIO_printf(Perl_debug_log,
5282 "%*s %sfailed...%s\n",
5283 REPORT_CODE_OFF+depth*2, "",
5284 PL_colors[4], PL_colors[5])
5296 /* there's a previous state to backtrack to */
5298 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5299 PL_regmatch_slab = PL_regmatch_slab->prev;
5300 st = SLAB_LAST(PL_regmatch_slab);
5302 PL_regmatch_state = st;
5303 locinput= st->locinput;
5304 nextchr = UCHARAT(locinput);
5306 DEBUG_STATE_pp("pop");
5308 if (yes_state == st)
5309 yes_state = st->u.yes.prev_yes_state;
5311 state_num = st->resume_state + 1; /* failure = success + 1 */
5312 goto reenter_switch;
5317 if (rex->intflags & PREGf_VERBARG_SEEN) {
5318 SV *sv_err = get_sv("REGERROR", 1);
5319 SV *sv_mrk = get_sv("REGMARK", 1);
5321 sv_commit = &PL_sv_no;
5323 sv_yes_mark = &PL_sv_yes;
5326 sv_commit = &PL_sv_yes;
5327 sv_yes_mark = &PL_sv_no;
5329 sv_setsv(sv_err, sv_commit);
5330 sv_setsv(sv_mrk, sv_yes_mark);
5333 /* clean up; in particular, free all slabs above current one */
5334 LEAVE_SCOPE(oldsave);
5340 - regrepeat - repeatedly match something simple, report how many
5343 * [This routine now assumes that it will only match on things of length 1.
5344 * That was true before, but now we assume scan - reginput is the count,
5345 * rather than incrementing count on every character. [Er, except utf8.]]
5348 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5351 register char *scan;
5353 register char *loceol = PL_regeol;
5354 register I32 hardcount = 0;
5355 register bool do_utf8 = PL_reg_match_utf8;
5357 PERL_UNUSED_ARG(depth);
5360 PERL_ARGS_ASSERT_REGREPEAT;
5363 if (max == REG_INFTY)
5365 else if (max < loceol - scan)
5366 loceol = scan + max;
5371 while (scan < loceol && hardcount < max && *scan != '\n') {
5372 scan += UTF8SKIP(scan);
5376 while (scan < loceol && *scan != '\n')
5383 while (scan < loceol && hardcount < max) {
5384 scan += UTF8SKIP(scan);
5394 case EXACT: /* length of string is 1 */
5396 while (scan < loceol && UCHARAT(scan) == c)
5399 case EXACTF: /* length of string is 1 */
5401 while (scan < loceol &&
5402 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5405 case EXACTFL: /* length of string is 1 */
5406 PL_reg_flags |= RF_tainted;
5408 while (scan < loceol &&
5409 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5415 while (hardcount < max && scan < loceol &&
5416 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5417 scan += UTF8SKIP(scan);
5421 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5428 LOAD_UTF8_CHARCLASS_ALNUM();
5429 while (hardcount < max && scan < loceol &&
5430 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5431 scan += UTF8SKIP(scan);
5435 while (scan < loceol && isALNUM(*scan))
5440 PL_reg_flags |= RF_tainted;
5443 while (hardcount < max && scan < loceol &&
5444 isALNUM_LC_utf8((U8*)scan)) {
5445 scan += UTF8SKIP(scan);
5449 while (scan < loceol && isALNUM_LC(*scan))
5456 LOAD_UTF8_CHARCLASS_ALNUM();
5457 while (hardcount < max && scan < loceol &&
5458 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5459 scan += UTF8SKIP(scan);
5463 while (scan < loceol && !isALNUM(*scan))
5468 PL_reg_flags |= RF_tainted;
5471 while (hardcount < max && scan < loceol &&
5472 !isALNUM_LC_utf8((U8*)scan)) {
5473 scan += UTF8SKIP(scan);
5477 while (scan < loceol && !isALNUM_LC(*scan))
5484 LOAD_UTF8_CHARCLASS_SPACE();
5485 while (hardcount < max && scan < loceol &&
5487 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5488 scan += UTF8SKIP(scan);
5492 while (scan < loceol && isSPACE(*scan))
5497 PL_reg_flags |= RF_tainted;
5500 while (hardcount < max && scan < loceol &&
5501 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5502 scan += UTF8SKIP(scan);
5506 while (scan < loceol && isSPACE_LC(*scan))
5513 LOAD_UTF8_CHARCLASS_SPACE();
5514 while (hardcount < max && scan < loceol &&
5516 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5517 scan += UTF8SKIP(scan);
5521 while (scan < loceol && !isSPACE(*scan))
5526 PL_reg_flags |= RF_tainted;
5529 while (hardcount < max && scan < loceol &&
5530 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5531 scan += UTF8SKIP(scan);
5535 while (scan < loceol && !isSPACE_LC(*scan))
5542 LOAD_UTF8_CHARCLASS_DIGIT();
5543 while (hardcount < max && scan < loceol &&
5544 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5545 scan += UTF8SKIP(scan);
5549 while (scan < loceol && isDIGIT(*scan))
5556 LOAD_UTF8_CHARCLASS_DIGIT();
5557 while (hardcount < max && scan < loceol &&
5558 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5559 scan += UTF8SKIP(scan);
5563 while (scan < loceol && !isDIGIT(*scan))
5569 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5575 LNBREAK can match two latin chars, which is ok,
5576 because we have a null terminated string, but we
5577 have to use hardcount in this situation
5579 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5588 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5593 while (scan < loceol && is_HORIZWS_latin1(scan))
5600 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5601 scan += UTF8SKIP(scan);
5605 while (scan < loceol && !is_HORIZWS_latin1(scan))
5613 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5618 while (scan < loceol && is_VERTWS_latin1(scan))
5626 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5627 scan += UTF8SKIP(scan);
5631 while (scan < loceol && !is_VERTWS_latin1(scan))
5637 default: /* Called on something of 0 width. */
5638 break; /* So match right here or not at all. */
5644 c = scan - PL_reginput;
5648 GET_RE_DEBUG_FLAGS_DECL;
5650 SV * const prop = sv_newmortal();
5651 regprop(prog, prop, p);
5652 PerlIO_printf(Perl_debug_log,
5653 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5654 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5662 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5664 - regclass_swash - prepare the utf8 swash
5668 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5674 RXi_GET_DECL(prog,progi);
5675 const struct reg_data * const data = prog ? progi->data : NULL;
5677 PERL_ARGS_ASSERT_REGCLASS_SWASH;
5679 if (data && data->count) {
5680 const U32 n = ARG(node);
5682 if (data->what[n] == 's') {
5683 SV * const rv = MUTABLE_SV(data->data[n]);
5684 AV * const av = MUTABLE_AV(SvRV(rv));
5685 SV **const ary = AvARRAY(av);
5688 /* See the end of regcomp.c:S_regclass() for
5689 * documentation of these array elements. */
5692 a = SvROK(ary[1]) ? &ary[1] : NULL;
5693 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5697 else if (si && doinit) {
5698 sw = swash_init("utf8", "", si, 1, 0);
5699 (void)av_store(av, 1, sw);
5716 - reginclass - determine if a character falls into a character class
5718 The n is the ANYOF regnode, the p is the target string, lenp
5719 is pointer to the maximum length of how far to go in the p
5720 (if the lenp is zero, UTF8SKIP(p) is used),
5721 do_utf8 tells whether the target string is in UTF-8.
5726 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5729 const char flags = ANYOF_FLAGS(n);
5735 PERL_ARGS_ASSERT_REGINCLASS;
5737 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5738 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5739 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5740 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5741 if (len == (STRLEN)-1)
5742 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5745 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5746 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5749 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5750 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5753 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5757 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5765 utf8_p = bytes_to_utf8(p, &len);
5767 if (swash_fetch(sw, utf8_p, 1))
5769 else if (flags & ANYOF_FOLD) {
5770 if (!match && lenp && av) {
5772 for (i = 0; i <= av_len(av); i++) {
5773 SV* const sv = *av_fetch(av, i, FALSE);
5775 const char * const s = SvPV_const(sv, len);
5776 if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
5784 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5787 to_utf8_fold(utf8_p, tmpbuf, &tmplen);
5788 if (swash_fetch(sw, tmpbuf, 1))
5793 /* If we allocated a string above, free it */
5794 if (! do_utf8) Safefree(utf8_p);
5797 if (match && lenp && *lenp == 0)
5798 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5800 if (!match && c < 256) {
5801 if (ANYOF_BITMAP_TEST(n, c))
5803 else if (flags & ANYOF_FOLD) {
5806 if (flags & ANYOF_LOCALE) {
5807 PL_reg_flags |= RF_tainted;
5808 f = PL_fold_locale[c];
5812 if (f != c && ANYOF_BITMAP_TEST(n, f))
5816 if (!match && (flags & ANYOF_CLASS)) {
5817 PL_reg_flags |= RF_tainted;
5819 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5820 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5821 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5822 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5823 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5824 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5825 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5826 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5827 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5828 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5829 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5830 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5831 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5832 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5833 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5834 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5835 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5836 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5837 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5838 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5839 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5840 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5841 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5842 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5843 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5844 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5845 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5846 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5847 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5848 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5849 ) /* How's that for a conditional? */
5856 return (flags & ANYOF_INVERT) ? !match : match;
5860 S_reghop3(U8 *s, I32 off, const U8* lim)
5864 PERL_ARGS_ASSERT_REGHOP3;
5867 while (off-- && s < lim) {
5868 /* XXX could check well-formedness here */
5873 while (off++ && s > lim) {
5875 if (UTF8_IS_CONTINUED(*s)) {
5876 while (s > lim && UTF8_IS_CONTINUATION(*s))
5879 /* XXX could check well-formedness here */
5886 /* there are a bunch of places where we use two reghop3's that should
5887 be replaced with this routine. but since thats not done yet
5888 we ifdef it out - dmq
5891 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5895 PERL_ARGS_ASSERT_REGHOP4;
5898 while (off-- && s < rlim) {
5899 /* XXX could check well-formedness here */
5904 while (off++ && s > llim) {
5906 if (UTF8_IS_CONTINUED(*s)) {
5907 while (s > llim && UTF8_IS_CONTINUATION(*s))
5910 /* XXX could check well-formedness here */
5918 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5922 PERL_ARGS_ASSERT_REGHOPMAYBE3;
5925 while (off-- && s < lim) {
5926 /* XXX could check well-formedness here */
5933 while (off++ && s > lim) {
5935 if (UTF8_IS_CONTINUED(*s)) {
5936 while (s > lim && UTF8_IS_CONTINUATION(*s))
5939 /* XXX could check well-formedness here */
5948 restore_pos(pTHX_ void *arg)
5951 regexp * const rex = (regexp *)arg;
5952 if (PL_reg_eval_set) {
5953 if (PL_reg_oldsaved) {
5954 rex->subbeg = PL_reg_oldsaved;
5955 rex->sublen = PL_reg_oldsavedlen;
5956 #ifdef PERL_OLD_COPY_ON_WRITE
5957 rex->saved_copy = PL_nrs;
5959 RXp_MATCH_COPIED_on(rex);
5961 PL_reg_magic->mg_len = PL_reg_oldpos;
5962 PL_reg_eval_set = 0;
5963 PL_curpm = PL_reg_oldcurpm;
5968 S_to_utf8_substr(pTHX_ register regexp *prog)
5972 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
5975 if (prog->substrs->data[i].substr
5976 && !prog->substrs->data[i].utf8_substr) {
5977 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5978 prog->substrs->data[i].utf8_substr = sv;
5979 sv_utf8_upgrade(sv);
5980 if (SvVALID(prog->substrs->data[i].substr)) {
5981 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5982 if (flags & FBMcf_TAIL) {
5983 /* Trim the trailing \n that fbm_compile added last
5985 SvCUR_set(sv, SvCUR(sv) - 1);
5986 /* Whilst this makes the SV technically "invalid" (as its
5987 buffer is no longer followed by "\0") when fbm_compile()
5988 adds the "\n" back, a "\0" is restored. */
5990 fbm_compile(sv, flags);
5992 if (prog->substrs->data[i].substr == prog->check_substr)
5993 prog->check_utf8 = sv;
5999 S_to_byte_substr(pTHX_ register regexp *prog)
6004 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6007 if (prog->substrs->data[i].utf8_substr
6008 && !prog->substrs->data[i].substr) {
6009 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6010 if (sv_utf8_downgrade(sv, TRUE)) {
6011 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6013 = BmFLAGS(prog->substrs->data[i].utf8_substr);
6014 if (flags & FBMcf_TAIL) {
6015 /* Trim the trailing \n that fbm_compile added last
6017 SvCUR_set(sv, SvCUR(sv) - 1);
6019 fbm_compile(sv, flags);
6025 prog->substrs->data[i].substr = sv;
6026 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6027 prog->check_substr = sv;
6034 * c-indentation-style: bsd
6036 * indent-tabs-mode: t
6039 * ex: set ts=8 sts=4 sw=4 noet: