5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
77 #ifdef PERL_IN_XSUB_RE
83 #define RF_tainted 1 /* tainted information used? */
84 #define RF_warned 2 /* warned about big count? */
86 #define RF_utf8 8 /* Pattern contains multibyte chars? */
88 #define UTF ((PL_reg_flags & RF_utf8) != 0)
90 #define RS_init 1 /* eval environment created */
91 #define RS_set 2 /* replsv value is set */
97 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
103 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
104 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
106 #define HOPc(pos,off) \
107 (char *)(PL_reg_match_utf8 \
108 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
110 #define HOPBACKc(pos, off) \
111 (char*)(PL_reg_match_utf8\
112 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
113 : (pos - off >= PL_bostr) \
117 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
118 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
120 /* these are unrolled below in the CCC_TRY_XXX defined */
121 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
122 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
123 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
124 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
125 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
126 #define LOAD_UTF8_CHARCLASS_MARK() LOAD_UTF8_CHARCLASS(mark, "\xcd\x86")
130 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
131 so that it is possible to override the option here without having to
132 rebuild the entire core. as we are required to do if we change regcomp.h
133 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
135 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
136 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
139 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
140 #define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM()
141 #define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE()
142 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
143 #define RE_utf8_perl_word PL_utf8_alnum
144 #define RE_utf8_perl_space PL_utf8_space
145 #define RE_utf8_posix_digit PL_utf8_digit
146 #define perl_word alnum
147 #define perl_space space
148 #define posix_digit digit
150 #define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a")
151 #define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ")
152 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
153 #define RE_utf8_perl_word PL_utf8_perl_word
154 #define RE_utf8_perl_space PL_utf8_perl_space
155 #define RE_utf8_posix_digit PL_utf8_posix_digit
159 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
161 PL_reg_flags |= RF_tainted; \
166 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \
167 if (!CAT2(PL_utf8_,CLASS)) { \
171 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
175 if (!(OP(scan) == NAME \
176 ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8) \
177 : LCFUNC_utf8((U8*)locinput))) \
181 locinput += PL_utf8skip[nextchr]; \
182 nextchr = UCHARAT(locinput); \
185 if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
187 nextchr = UCHARAT(++locinput); \
190 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
192 PL_reg_flags |= RF_tainted; \
195 if (!nextchr && locinput >= PL_regeol) \
197 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \
198 if (!CAT2(PL_utf8_,CLASS)) { \
202 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
206 if ((OP(scan) == NAME \
207 ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8) \
208 : LCFUNC_utf8((U8*)locinput))) \
212 locinput += PL_utf8skip[nextchr]; \
213 nextchr = UCHARAT(locinput); \
216 if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
218 nextchr = UCHARAT(++locinput); \
225 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
227 /* for use after a quantifier and before an EXACT-like node -- japhy */
228 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
229 #define JUMPABLE(rn) ( \
231 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
233 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
234 OP(rn) == PLUS || OP(rn) == MINMOD || \
235 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
236 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
238 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
240 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
243 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
244 we don't need this definition. */
245 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
246 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
247 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
250 /* ... so we use this as its faster. */
251 #define IS_TEXT(rn) ( OP(rn)==EXACT )
252 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
253 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
258 Search for mandatory following text node; for lookahead, the text must
259 follow but for lookbehind (rn->flags != 0) we skip to the next step.
261 #define FIND_NEXT_IMPT(rn) STMT_START { \
262 while (JUMPABLE(rn)) { \
263 const OPCODE type = OP(rn); \
264 if (type == SUSPEND || PL_regkind[type] == CURLY) \
265 rn = NEXTOPER(NEXTOPER(rn)); \
266 else if (type == PLUS) \
268 else if (type == IFMATCH) \
269 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
270 else rn += NEXT_OFF(rn); \
275 static void restore_pos(pTHX_ void *arg);
278 S_regcppush(pTHX_ I32 parenfloor)
281 const int retval = PL_savestack_ix;
282 #define REGCP_PAREN_ELEMS 4
283 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
285 GET_RE_DEBUG_FLAGS_DECL;
287 if (paren_elems_to_push < 0)
288 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
290 #define REGCP_OTHER_ELEMS 7
291 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
293 for (p = PL_regsize; p > parenfloor; p--) {
294 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
295 SSPUSHINT(PL_regoffs[p].end);
296 SSPUSHINT(PL_regoffs[p].start);
297 SSPUSHPTR(PL_reg_start_tmp[p]);
299 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
300 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
301 (UV)p, (IV)PL_regoffs[p].start,
302 (IV)(PL_reg_start_tmp[p] - PL_bostr),
303 (IV)PL_regoffs[p].end
306 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
307 SSPUSHPTR(PL_regoffs);
308 SSPUSHINT(PL_regsize);
309 SSPUSHINT(*PL_reglastparen);
310 SSPUSHINT(*PL_reglastcloseparen);
311 SSPUSHPTR(PL_reginput);
312 #define REGCP_FRAME_ELEMS 2
313 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
314 * are needed for the regexp context stack bookkeeping. */
315 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
316 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
321 /* These are needed since we do not localize EVAL nodes: */
322 #define REGCP_SET(cp) \
324 PerlIO_printf(Perl_debug_log, \
325 " Setting an EVAL scope, savestack=%"IVdf"\n", \
326 (IV)PL_savestack_ix)); \
329 #define REGCP_UNWIND(cp) \
331 if (cp != PL_savestack_ix) \
332 PerlIO_printf(Perl_debug_log, \
333 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
334 (IV)(cp), (IV)PL_savestack_ix)); \
338 S_regcppop(pTHX_ const regexp *rex)
343 GET_RE_DEBUG_FLAGS_DECL;
345 PERL_ARGS_ASSERT_REGCPPOP;
347 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
349 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
350 i = SSPOPINT; /* Parentheses elements to pop. */
351 input = (char *) SSPOPPTR;
352 *PL_reglastcloseparen = SSPOPINT;
353 *PL_reglastparen = SSPOPINT;
354 PL_regsize = SSPOPINT;
355 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
358 /* Now restore the parentheses context. */
359 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
360 i > 0; i -= REGCP_PAREN_ELEMS) {
362 U32 paren = (U32)SSPOPINT;
363 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
364 PL_regoffs[paren].start = SSPOPINT;
366 if (paren <= *PL_reglastparen)
367 PL_regoffs[paren].end = tmps;
369 PerlIO_printf(Perl_debug_log,
370 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
371 (UV)paren, (IV)PL_regoffs[paren].start,
372 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
373 (IV)PL_regoffs[paren].end,
374 (paren > *PL_reglastparen ? "(no)" : ""));
378 if (*PL_reglastparen + 1 <= rex->nparens) {
379 PerlIO_printf(Perl_debug_log,
380 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
381 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
385 /* It would seem that the similar code in regtry()
386 * already takes care of this, and in fact it is in
387 * a better location to since this code can #if 0-ed out
388 * but the code in regtry() is needed or otherwise tests
389 * requiring null fields (pat.t#187 and split.t#{13,14}
390 * (as of patchlevel 7877) will fail. Then again,
391 * this code seems to be necessary or otherwise
392 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
393 * --jhi updated by dapm */
394 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
396 PL_regoffs[i].start = -1;
397 PL_regoffs[i].end = -1;
403 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
406 * pregexec and friends
409 #ifndef PERL_IN_XSUB_RE
411 - pregexec - match a regexp against a string
414 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
415 char *strbeg, I32 minend, SV *screamer, U32 nosave)
416 /* strend: pointer to null at end of string */
417 /* strbeg: real beginning of string */
418 /* minend: end of match must be >=minend after stringarg. */
419 /* nosave: For optimizations. */
421 PERL_ARGS_ASSERT_PREGEXEC;
424 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
425 nosave ? 0 : REXEC_COPY_STR);
430 * Need to implement the following flags for reg_anch:
432 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
434 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
435 * INTUIT_AUTORITATIVE_ML
436 * INTUIT_ONCE_NOML - Intuit can match in one location only.
439 * Another flag for this function: SECOND_TIME (so that float substrs
440 * with giant delta may be not rechecked).
443 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
445 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
446 Otherwise, only SvCUR(sv) is used to get strbeg. */
448 /* XXXX We assume that strpos is strbeg unless sv. */
450 /* XXXX Some places assume that there is a fixed substring.
451 An update may be needed if optimizer marks as "INTUITable"
452 RExen without fixed substrings. Similarly, it is assumed that
453 lengths of all the strings are no more than minlen, thus they
454 cannot come from lookahead.
455 (Or minlen should take into account lookahead.)
456 NOTE: Some of this comment is not correct. minlen does now take account
457 of lookahead/behind. Further research is required. -- demerphq
461 /* A failure to find a constant substring means that there is no need to make
462 an expensive call to REx engine, thus we celebrate a failure. Similarly,
463 finding a substring too deep into the string means that less calls to
464 regtry() should be needed.
466 REx compiler's optimizer found 4 possible hints:
467 a) Anchored substring;
469 c) Whether we are anchored (beginning-of-line or \G);
470 d) First node (of those at offset 0) which may distingush positions;
471 We use a)b)d) and multiline-part of c), and try to find a position in the
472 string which does not contradict any of them.
475 /* Most of decisions we do here should have been done at compile time.
476 The nodes of the REx which we used for the search should have been
477 deleted from the finite automaton. */
480 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
481 char *strend, const U32 flags, re_scream_pos_data *data)
484 struct regexp *const prog = (struct regexp *)SvANY(rx);
485 register I32 start_shift = 0;
486 /* Should be nonnegative! */
487 register I32 end_shift = 0;
492 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
494 register char *other_last = NULL; /* other substr checked before this */
495 char *check_at = NULL; /* check substr found at this pos */
496 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
497 RXi_GET_DECL(prog,progi);
499 const char * const i_strpos = strpos;
501 GET_RE_DEBUG_FLAGS_DECL;
503 PERL_ARGS_ASSERT_RE_INTUIT_START;
505 RX_MATCH_UTF8_set(rx,do_utf8);
508 PL_reg_flags |= RF_utf8;
511 debug_start_match(rx, do_utf8, strpos, strend,
512 sv ? "Guessing start of match in sv for"
513 : "Guessing start of match in string for");
516 /* CHR_DIST() would be more correct here but it makes things slow. */
517 if (prog->minlen > strend - strpos) {
518 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
519 "String too short... [re_intuit_start]\n"));
523 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
526 if (!prog->check_utf8 && prog->check_substr)
527 to_utf8_substr(prog);
528 check = prog->check_utf8;
530 if (!prog->check_substr && prog->check_utf8)
531 to_byte_substr(prog);
532 check = prog->check_substr;
534 if (check == &PL_sv_undef) {
535 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
536 "Non-utf8 string cannot match utf8 check string\n"));
539 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
540 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
541 || ( (prog->extflags & RXf_ANCH_BOL)
542 && !multiline ) ); /* Check after \n? */
545 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
546 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
547 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
549 && (strpos != strbeg)) {
550 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
553 if (prog->check_offset_min == prog->check_offset_max &&
554 !(prog->extflags & RXf_CANY_SEEN)) {
555 /* Substring at constant offset from beg-of-str... */
558 s = HOP3c(strpos, prog->check_offset_min, strend);
561 slen = SvCUR(check); /* >= 1 */
563 if ( strend - s > slen || strend - s < slen - 1
564 || (strend - s == slen && strend[-1] != '\n')) {
565 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
568 /* Now should match s[0..slen-2] */
570 if (slen && (*SvPVX_const(check) != *s
572 && memNE(SvPVX_const(check), s, slen)))) {
574 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
578 else if (*SvPVX_const(check) != *s
579 || ((slen = SvCUR(check)) > 1
580 && memNE(SvPVX_const(check), s, slen)))
583 goto success_at_start;
586 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
588 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
589 end_shift = prog->check_end_shift;
592 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
593 - (SvTAIL(check) != 0);
594 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
596 if (end_shift < eshift)
600 else { /* Can match at random position */
603 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
604 end_shift = prog->check_end_shift;
606 /* end shift should be non negative here */
609 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
611 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
612 (IV)end_shift, RX_PRECOMP(prog));
616 /* Find a possible match in the region s..strend by looking for
617 the "check" substring in the region corrected by start/end_shift. */
620 I32 srch_start_shift = start_shift;
621 I32 srch_end_shift = end_shift;
622 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
623 srch_end_shift -= ((strbeg - s) - srch_start_shift);
624 srch_start_shift = strbeg - s;
626 DEBUG_OPTIMISE_MORE_r({
627 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
628 (IV)prog->check_offset_min,
629 (IV)srch_start_shift,
631 (IV)prog->check_end_shift);
634 if (flags & REXEC_SCREAM) {
635 I32 p = -1; /* Internal iterator of scream. */
636 I32 * const pp = data ? data->scream_pos : &p;
638 if (PL_screamfirst[BmRARE(check)] >= 0
639 || ( BmRARE(check) == '\n'
640 && (BmPREVIOUS(check) == SvCUR(check) - 1)
642 s = screaminstr(sv, check,
643 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
646 /* we may be pointing at the wrong string */
647 if (s && RXp_MATCH_COPIED(prog))
648 s = strbeg + (s - SvPVX_const(sv));
650 *data->scream_olds = s;
655 if (prog->extflags & RXf_CANY_SEEN) {
656 start_point= (U8*)(s + srch_start_shift);
657 end_point= (U8*)(strend - srch_end_shift);
659 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
660 end_point= HOP3(strend, -srch_end_shift, strbeg);
662 DEBUG_OPTIMISE_MORE_r({
663 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
664 (int)(end_point - start_point),
665 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
669 s = fbm_instr( start_point, end_point,
670 check, multiline ? FBMrf_MULTILINE : 0);
673 /* Update the count-of-usability, remove useless subpatterns,
677 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
678 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
679 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
680 (s ? "Found" : "Did not find"),
681 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
682 ? "anchored" : "floating"),
685 (s ? " at offset " : "...\n") );
690 /* Finish the diagnostic message */
691 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
693 /* XXX dmq: first branch is for positive lookbehind...
694 Our check string is offset from the beginning of the pattern.
695 So we need to do any stclass tests offset forward from that
704 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
705 Start with the other substr.
706 XXXX no SCREAM optimization yet - and a very coarse implementation
707 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
708 *always* match. Probably should be marked during compile...
709 Probably it is right to do no SCREAM here...
712 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
713 : (prog->float_substr && prog->anchored_substr))
715 /* Take into account the "other" substring. */
716 /* XXXX May be hopelessly wrong for UTF... */
719 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
722 char * const last = HOP3c(s, -start_shift, strbeg);
724 char * const saved_s = s;
727 t = s - prog->check_offset_max;
728 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
730 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
735 t = HOP3c(t, prog->anchored_offset, strend);
736 if (t < other_last) /* These positions already checked */
738 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
741 /* XXXX It is not documented what units *_offsets are in.
742 We assume bytes, but this is clearly wrong.
743 Meaning this code needs to be carefully reviewed for errors.
747 /* On end-of-str: see comment below. */
748 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
749 if (must == &PL_sv_undef) {
751 DEBUG_r(must = prog->anchored_utf8); /* for debug */
756 HOP3(HOP3(last1, prog->anchored_offset, strend)
757 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
759 multiline ? FBMrf_MULTILINE : 0
762 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
763 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
764 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
765 (s ? "Found" : "Contradicts"),
766 quoted, RE_SV_TAIL(must));
771 if (last1 >= last2) {
772 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
773 ", giving up...\n"));
776 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
777 ", trying floating at offset %ld...\n",
778 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
779 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
780 s = HOP3c(last, 1, strend);
784 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
785 (long)(s - i_strpos)));
786 t = HOP3c(s, -prog->anchored_offset, strbeg);
787 other_last = HOP3c(s, 1, strend);
795 else { /* Take into account the floating substring. */
797 char * const saved_s = s;
800 t = HOP3c(s, -start_shift, strbeg);
802 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
803 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
804 last = HOP3c(t, prog->float_max_offset, strend);
805 s = HOP3c(t, prog->float_min_offset, strend);
808 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
809 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
810 /* fbm_instr() takes into account exact value of end-of-str
811 if the check is SvTAIL(ed). Since false positives are OK,
812 and end-of-str is not later than strend we are OK. */
813 if (must == &PL_sv_undef) {
815 DEBUG_r(must = prog->float_utf8); /* for debug message */
818 s = fbm_instr((unsigned char*)s,
819 (unsigned char*)last + SvCUR(must)
821 must, multiline ? FBMrf_MULTILINE : 0);
823 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
824 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
825 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
826 (s ? "Found" : "Contradicts"),
827 quoted, RE_SV_TAIL(must));
831 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
832 ", giving up...\n"));
835 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
836 ", trying anchored starting at offset %ld...\n",
837 (long)(saved_s + 1 - i_strpos)));
839 s = HOP3c(t, 1, strend);
843 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
844 (long)(s - i_strpos)));
845 other_last = s; /* Fix this later. --Hugo */
855 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
857 DEBUG_OPTIMISE_MORE_r(
858 PerlIO_printf(Perl_debug_log,
859 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
860 (IV)prog->check_offset_min,
861 (IV)prog->check_offset_max,
869 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
871 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
874 /* Fixed substring is found far enough so that the match
875 cannot start at strpos. */
877 if (ml_anch && t[-1] != '\n') {
878 /* Eventually fbm_*() should handle this, but often
879 anchored_offset is not 0, so this check will not be wasted. */
880 /* XXXX In the code below we prefer to look for "^" even in
881 presence of anchored substrings. And we search even
882 beyond the found float position. These pessimizations
883 are historical artefacts only. */
885 while (t < strend - prog->minlen) {
887 if (t < check_at - prog->check_offset_min) {
888 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
889 /* Since we moved from the found position,
890 we definitely contradict the found anchored
891 substr. Due to the above check we do not
892 contradict "check" substr.
893 Thus we can arrive here only if check substr
894 is float. Redo checking for "other"=="fixed".
897 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
898 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
899 goto do_other_anchored;
901 /* We don't contradict the found floating substring. */
902 /* XXXX Why not check for STCLASS? */
904 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
905 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
908 /* Position contradicts check-string */
909 /* XXXX probably better to look for check-string
910 than for "\n", so one should lower the limit for t? */
911 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
912 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
913 other_last = strpos = s = t + 1;
918 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
919 PL_colors[0], PL_colors[1]));
923 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
924 PL_colors[0], PL_colors[1]));
928 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
931 /* The found string does not prohibit matching at strpos,
932 - no optimization of calling REx engine can be performed,
933 unless it was an MBOL and we are not after MBOL,
934 or a future STCLASS check will fail this. */
936 /* Even in this situation we may use MBOL flag if strpos is offset
937 wrt the start of the string. */
938 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
939 && (strpos != strbeg) && strpos[-1] != '\n'
940 /* May be due to an implicit anchor of m{.*foo} */
941 && !(prog->intflags & PREGf_IMPLICIT))
946 DEBUG_EXECUTE_r( if (ml_anch)
947 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
948 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
951 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
953 prog->check_utf8 /* Could be deleted already */
954 && --BmUSEFUL(prog->check_utf8) < 0
955 && (prog->check_utf8 == prog->float_utf8)
957 prog->check_substr /* Could be deleted already */
958 && --BmUSEFUL(prog->check_substr) < 0
959 && (prog->check_substr == prog->float_substr)
962 /* If flags & SOMETHING - do not do it many times on the same match */
963 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
964 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
965 if (do_utf8 ? prog->check_substr : prog->check_utf8)
966 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
967 prog->check_substr = prog->check_utf8 = NULL; /* disable */
968 prog->float_substr = prog->float_utf8 = NULL; /* clear */
969 check = NULL; /* abort */
971 /* XXXX This is a remnant of the old implementation. It
972 looks wasteful, since now INTUIT can use many
974 prog->extflags &= ~RXf_USE_INTUIT;
981 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
982 /* trie stclasses are too expensive to use here, we are better off to
983 leave it to regmatch itself */
984 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
985 /* minlen == 0 is possible if regstclass is \b or \B,
986 and the fixed substr is ''$.
987 Since minlen is already taken into account, s+1 is before strend;
988 accidentally, minlen >= 1 guaranties no false positives at s + 1
989 even for \b or \B. But (minlen? 1 : 0) below assumes that
990 regstclass does not come from lookahead... */
991 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
992 This leaves EXACTF only, which is dealt with in find_byclass(). */
993 const U8* const str = (U8*)STRING(progi->regstclass);
994 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
995 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
998 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
999 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1000 else if (prog->float_substr || prog->float_utf8)
1001 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1005 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1006 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1009 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1012 const char *what = NULL;
1014 if (endpos == strend) {
1015 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1016 "Could not match STCLASS...\n") );
1019 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1020 "This position contradicts STCLASS...\n") );
1021 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1023 /* Contradict one of substrings */
1024 if (prog->anchored_substr || prog->anchored_utf8) {
1025 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1026 DEBUG_EXECUTE_r( what = "anchored" );
1028 s = HOP3c(t, 1, strend);
1029 if (s + start_shift + end_shift > strend) {
1030 /* XXXX Should be taken into account earlier? */
1031 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1032 "Could not match STCLASS...\n") );
1037 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1038 "Looking for %s substr starting at offset %ld...\n",
1039 what, (long)(s + start_shift - i_strpos)) );
1042 /* Have both, check_string is floating */
1043 if (t + start_shift >= check_at) /* Contradicts floating=check */
1044 goto retry_floating_check;
1045 /* Recheck anchored substring, but not floating... */
1049 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1050 "Looking for anchored substr starting at offset %ld...\n",
1051 (long)(other_last - i_strpos)) );
1052 goto do_other_anchored;
1054 /* Another way we could have checked stclass at the
1055 current position only: */
1060 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1061 "Looking for /%s^%s/m starting at offset %ld...\n",
1062 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1065 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1067 /* Check is floating subtring. */
1068 retry_floating_check:
1069 t = check_at - start_shift;
1070 DEBUG_EXECUTE_r( what = "floating" );
1071 goto hop_and_restart;
1074 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1075 "By STCLASS: moving %ld --> %ld\n",
1076 (long)(t - i_strpos), (long)(s - i_strpos))
1080 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1081 "Does not contradict STCLASS...\n");
1086 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1087 PL_colors[4], (check ? "Guessed" : "Giving up"),
1088 PL_colors[5], (long)(s - i_strpos)) );
1091 fail_finish: /* Substring not found */
1092 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1093 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1095 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1096 PL_colors[4], PL_colors[5]));
1100 #define DECL_TRIE_TYPE(scan) \
1101 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1102 trie_type = (scan->flags != EXACT) \
1103 ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1104 : (do_utf8 ? trie_utf8 : trie_plain)
1106 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1107 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1108 UV uvc_unfolded = 0; \
1109 switch (trie_type) { \
1110 case trie_utf8_fold: \
1111 if ( foldlen>0 ) { \
1112 uvc_unfolded = uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1117 uvc_unfolded = uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1118 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1119 foldlen -= UNISKIP( uvc ); \
1120 uscan = foldbuf + UNISKIP( uvc ); \
1123 case trie_latin_utf8_fold: \
1124 if ( foldlen>0 ) { \
1125 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1131 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1132 foldlen -= UNISKIP( uvc ); \
1133 uscan = foldbuf + UNISKIP( uvc ); \
1137 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1145 charid = trie->charmap[ uvc ]; \
1149 if (widecharmap) { \
1150 SV** const svpp = hv_fetch(widecharmap, \
1151 (char*)&uvc, sizeof(UV), 0); \
1153 charid = (U16)SvIV(*svpp); \
1156 if (!charid && trie_type == trie_utf8_fold && !UTF) { \
1157 charid = trie->charmap[uvc_unfolded]; \
1161 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1163 char *my_strend= (char *)strend; \
1166 !ibcmp_utf8(s, &my_strend, 0, do_utf8, \
1167 m, NULL, ln, (bool)UTF)) \
1168 && (!reginfo || regtry(reginfo, &s)) ) \
1171 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1172 uvchr_to_utf8(tmpbuf, c); \
1173 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1175 && (f == c1 || f == c2) \
1177 !ibcmp_utf8(s, &my_strend, 0, do_utf8,\
1178 m, NULL, ln, (bool)UTF)) \
1179 && (!reginfo || regtry(reginfo, &s)) ) \
1185 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1189 && (ln == 1 || !(OP(c) == EXACTF \
1191 : ibcmp_locale(s, m, ln))) \
1192 && (!reginfo || regtry(reginfo, &s)) ) \
1198 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1200 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1206 #define REXEC_FBC_SCAN(CoDe) \
1208 while (s < strend) { \
1214 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1215 REXEC_FBC_UTF8_SCAN( \
1217 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1226 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1229 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1238 #define REXEC_FBC_TRYIT \
1239 if ((!reginfo || regtry(reginfo, &s))) \
1242 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1244 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1247 REXEC_FBC_CLASS_SCAN(CoNd); \
1251 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1254 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1257 REXEC_FBC_CLASS_SCAN(CoNd); \
1261 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1262 PL_reg_flags |= RF_tainted; \
1264 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1267 REXEC_FBC_CLASS_SCAN(CoNd); \
1271 #define DUMP_EXEC_POS(li,s,doutf8) \
1272 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1274 /* We know what class REx starts with. Try to find this position... */
1275 /* if reginfo is NULL, its a dryrun */
1276 /* annoyingly all the vars in this routine have different names from their counterparts
1277 in regmatch. /grrr */
1280 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1281 const char *strend, regmatch_info *reginfo)
1284 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1288 register STRLEN uskip;
1292 register I32 tmp = 1; /* Scratch variable? */
1293 register const bool do_utf8 = PL_reg_match_utf8;
1294 RXi_GET_DECL(prog,progi);
1296 PERL_ARGS_ASSERT_FIND_BYCLASS;
1298 /* We know what class it must start with. */
1302 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1303 !UTF8_IS_INVARIANT((U8)s[0]) ?
1304 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1305 REGINCLASS(prog, c, (U8*)s));
1308 while (s < strend) {
1311 if (REGINCLASS(prog, c, (U8*)s) ||
1312 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1313 /* The assignment of 2 is intentional:
1314 * for the folded sharp s, the skip is 2. */
1315 (skip = SHARP_S_SKIP))) {
1316 if (tmp && (!reginfo || regtry(reginfo, &s)))
1329 if (tmp && (!reginfo || regtry(reginfo, &s)))
1337 ln = STR_LEN(c); /* length to match in octets/bytes */
1338 lnc = (I32) ln; /* length to match in characters */
1340 STRLEN ulen1, ulen2;
1342 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1343 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1344 /* used by commented-out code below */
1345 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1347 /* XXX: Since the node will be case folded at compile
1348 time this logic is a little odd, although im not
1349 sure that its actually wrong. --dmq */
1351 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1352 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1354 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1355 codepoint of the first character in the converted
1356 form, yet originally we did the extra step.
1357 No tests fail by commenting this code out however
1358 so Ive left it out. -- dmq.
1360 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1362 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1367 while (sm < ((U8 *) m + ln)) {
1382 c2 = PL_fold_locale[c1];
1384 e = HOP3c(strend, -((I32)lnc), s);
1386 if (!reginfo && e < s)
1387 e = s; /* Due to minlen logic of intuit() */
1389 /* The idea in the EXACTF* cases is to first find the
1390 * first character of the EXACTF* node and then, if
1391 * necessary, case-insensitively compare the full
1392 * text of the node. The c1 and c2 are the first
1393 * characters (though in Unicode it gets a bit
1394 * more complicated because there are more cases
1395 * than just upper and lower: one needs to use
1396 * the so-called folding case for case-insensitive
1397 * matching (called "loose matching" in Unicode).
1398 * ibcmp_utf8() will do just that. */
1400 if (do_utf8 || UTF) {
1402 U8 tmpbuf [UTF8_MAXBYTES+1];
1405 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1407 /* Upper and lower of 1st char are equal -
1408 * probably not a "letter". */
1411 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1416 REXEC_FBC_EXACTISH_CHECK(c == c1);
1422 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1428 /* Handle some of the three Greek sigmas cases.
1429 * Note that not all the possible combinations
1430 * are handled here: some of them are handled
1431 * by the standard folding rules, and some of
1432 * them (the character class or ANYOF cases)
1433 * are handled during compiletime in
1434 * regexec.c:S_regclass(). */
1435 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1436 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1437 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1439 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1444 /* Neither pattern nor string are UTF8 */
1446 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1448 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1452 PL_reg_flags |= RF_tainted;
1459 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1460 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1462 tmp = ((OP(c) == BOUND ?
1463 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1464 LOAD_UTF8_CHARCLASS_ALNUM();
1465 REXEC_FBC_UTF8_SCAN(
1466 if (tmp == !(OP(c) == BOUND ?
1467 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1468 isALNUM_LC_utf8((U8*)s)))
1476 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1477 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1480 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1486 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1490 PL_reg_flags |= RF_tainted;
1497 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1498 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1500 tmp = ((OP(c) == NBOUND ?
1501 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1502 LOAD_UTF8_CHARCLASS_ALNUM();
1503 REXEC_FBC_UTF8_SCAN(
1504 if (tmp == !(OP(c) == NBOUND ?
1505 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1506 isALNUM_LC_utf8((U8*)s)))
1508 else REXEC_FBC_TRYIT;
1512 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1513 tmp = ((OP(c) == NBOUND ?
1514 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1517 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1519 else REXEC_FBC_TRYIT;
1522 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1526 REXEC_FBC_CSCAN_PRELOAD(
1527 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1528 swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1532 REXEC_FBC_CSCAN_TAINT(
1533 isALNUM_LC_utf8((U8*)s),
1537 REXEC_FBC_CSCAN_PRELOAD(
1538 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1539 !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1543 REXEC_FBC_CSCAN_TAINT(
1544 !isALNUM_LC_utf8((U8*)s),
1548 REXEC_FBC_CSCAN_PRELOAD(
1549 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1550 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
1554 REXEC_FBC_CSCAN_TAINT(
1555 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1559 REXEC_FBC_CSCAN_PRELOAD(
1560 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1561 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
1565 REXEC_FBC_CSCAN_TAINT(
1566 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1570 REXEC_FBC_CSCAN_PRELOAD(
1571 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1572 swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1576 REXEC_FBC_CSCAN_TAINT(
1577 isDIGIT_LC_utf8((U8*)s),
1581 REXEC_FBC_CSCAN_PRELOAD(
1582 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1583 !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1587 REXEC_FBC_CSCAN_TAINT(
1588 !isDIGIT_LC_utf8((U8*)s),
1594 is_LNBREAK_latin1(s)
1604 !is_VERTWS_latin1(s)
1609 is_HORIZWS_latin1(s)
1613 !is_HORIZWS_utf8(s),
1614 !is_HORIZWS_latin1(s)
1620 /* what trie are we using right now */
1622 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1624 = (reg_trie_data*)progi->data->data[ aho->trie ];
1625 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1627 const char *last_start = strend - trie->minlen;
1629 const char *real_start = s;
1631 STRLEN maxlen = trie->maxlen;
1633 U8 **points; /* map of where we were in the input string
1634 when reading a given char. For ASCII this
1635 is unnecessary overhead as the relationship
1636 is always 1:1, but for Unicode, especially
1637 case folded Unicode this is not true. */
1638 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1642 GET_RE_DEBUG_FLAGS_DECL;
1644 /* We can't just allocate points here. We need to wrap it in
1645 * an SV so it gets freed properly if there is a croak while
1646 * running the match */
1649 sv_points=newSV(maxlen * sizeof(U8 *));
1650 SvCUR_set(sv_points,
1651 maxlen * sizeof(U8 *));
1652 SvPOK_on(sv_points);
1653 sv_2mortal(sv_points);
1654 points=(U8**)SvPV_nolen(sv_points );
1655 if ( trie_type != trie_utf8_fold
1656 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1659 bitmap=(U8*)trie->bitmap;
1661 bitmap=(U8*)ANYOF_BITMAP(c);
1663 /* this is the Aho-Corasick algorithm modified a touch
1664 to include special handling for long "unknown char"
1665 sequences. The basic idea being that we use AC as long
1666 as we are dealing with a possible matching char, when
1667 we encounter an unknown char (and we have not encountered
1668 an accepting state) we scan forward until we find a legal
1670 AC matching is basically that of trie matching, except
1671 that when we encounter a failing transition, we fall back
1672 to the current states "fail state", and try the current char
1673 again, a process we repeat until we reach the root state,
1674 state 1, or a legal transition. If we fail on the root state
1675 then we can either terminate if we have reached an accepting
1676 state previously, or restart the entire process from the beginning
1680 while (s <= last_start) {
1681 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1689 U8 *uscan = (U8*)NULL;
1690 U8 *leftmost = NULL;
1692 U32 accepted_word= 0;
1696 while ( state && uc <= (U8*)strend ) {
1698 U32 word = aho->states[ state ].wordnum;
1702 DEBUG_TRIE_EXECUTE_r(
1703 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1704 dump_exec_pos( (char *)uc, c, strend, real_start,
1705 (char *)uc, do_utf8 );
1706 PerlIO_printf( Perl_debug_log,
1707 " Scanning for legal start char...\n");
1710 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1715 if (uc >(U8*)last_start) break;
1719 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1720 if (!leftmost || lpos < leftmost) {
1721 DEBUG_r(accepted_word=word);
1727 points[pointpos++ % maxlen]= uc;
1728 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1729 uscan, len, uvc, charid, foldlen,
1731 DEBUG_TRIE_EXECUTE_r({
1732 dump_exec_pos( (char *)uc, c, strend, real_start,
1734 PerlIO_printf(Perl_debug_log,
1735 " Charid:%3u CP:%4"UVxf" ",
1741 word = aho->states[ state ].wordnum;
1743 base = aho->states[ state ].trans.base;
1745 DEBUG_TRIE_EXECUTE_r({
1747 dump_exec_pos( (char *)uc, c, strend, real_start,
1749 PerlIO_printf( Perl_debug_log,
1750 "%sState: %4"UVxf", word=%"UVxf,
1751 failed ? " Fail transition to " : "",
1752 (UV)state, (UV)word);
1757 (base + charid > trie->uniquecharcount )
1758 && (base + charid - 1 - trie->uniquecharcount
1760 && trie->trans[base + charid - 1 -
1761 trie->uniquecharcount].check == state
1762 && (tmp=trie->trans[base + charid - 1 -
1763 trie->uniquecharcount ].next))
1765 DEBUG_TRIE_EXECUTE_r(
1766 PerlIO_printf( Perl_debug_log," - legal\n"));
1771 DEBUG_TRIE_EXECUTE_r(
1772 PerlIO_printf( Perl_debug_log," - fail\n"));
1774 state = aho->fail[state];
1778 /* we must be accepting here */
1779 DEBUG_TRIE_EXECUTE_r(
1780 PerlIO_printf( Perl_debug_log," - accepting\n"));
1789 if (!state) state = 1;
1792 if ( aho->states[ state ].wordnum ) {
1793 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1794 if (!leftmost || lpos < leftmost) {
1795 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1800 s = (char*)leftmost;
1801 DEBUG_TRIE_EXECUTE_r({
1803 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1804 (UV)accepted_word, (IV)(s - real_start)
1807 if (!reginfo || regtry(reginfo, &s)) {
1813 DEBUG_TRIE_EXECUTE_r({
1814 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1817 DEBUG_TRIE_EXECUTE_r(
1818 PerlIO_printf( Perl_debug_log,"No match.\n"));
1827 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1837 - regexec_flags - match a regexp against a string
1840 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1841 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1842 /* strend: pointer to null at end of string */
1843 /* strbeg: real beginning of string */
1844 /* minend: end of match must be >=minend after stringarg. */
1845 /* data: May be used for some additional optimizations.
1846 Currently its only used, with a U32 cast, for transmitting
1847 the ganch offset when doing a /g match. This will change */
1848 /* nosave: For optimizations. */
1851 struct regexp *const prog = (struct regexp *)SvANY(rx);
1852 /*register*/ char *s;
1853 register regnode *c;
1854 /*register*/ char *startpos = stringarg;
1855 I32 minlen; /* must match at least this many chars */
1856 I32 dontbother = 0; /* how many characters not to try at end */
1857 I32 end_shift = 0; /* Same for the end. */ /* CC */
1858 I32 scream_pos = -1; /* Internal iterator of scream. */
1859 char *scream_olds = NULL;
1860 const bool do_utf8 = (bool)DO_UTF8(sv);
1862 RXi_GET_DECL(prog,progi);
1863 regmatch_info reginfo; /* create some info to pass to regtry etc */
1864 regexp_paren_pair *swap = NULL;
1865 GET_RE_DEBUG_FLAGS_DECL;
1867 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1868 PERL_UNUSED_ARG(data);
1870 /* Be paranoid... */
1871 if (prog == NULL || startpos == NULL) {
1872 Perl_croak(aTHX_ "NULL regexp parameter");
1876 multiline = prog->extflags & RXf_PMf_MULTILINE;
1877 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
1879 RX_MATCH_UTF8_set(rx, do_utf8);
1881 debug_start_match(rx, do_utf8, startpos, strend,
1885 minlen = prog->minlen;
1887 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1888 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1889 "String too short [regexec_flags]...\n"));
1894 /* Check validity of program. */
1895 if (UCHARAT(progi->program) != REG_MAGIC) {
1896 Perl_croak(aTHX_ "corrupted regexp program");
1900 PL_reg_eval_set = 0;
1904 PL_reg_flags |= RF_utf8;
1906 /* Mark beginning of line for ^ and lookbehind. */
1907 reginfo.bol = startpos; /* XXX not used ??? */
1911 /* Mark end of line for $ (and such) */
1914 /* see how far we have to get to not match where we matched before */
1915 reginfo.till = startpos+minend;
1917 /* If there is a "must appear" string, look for it. */
1920 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1922 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
1923 reginfo.ganch = startpos + prog->gofs;
1924 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1925 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1926 } else if (sv && SvTYPE(sv) >= SVt_PVMG
1928 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1929 && mg->mg_len >= 0) {
1930 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1931 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1932 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
1934 if (prog->extflags & RXf_ANCH_GPOS) {
1935 if (s > reginfo.ganch)
1937 s = reginfo.ganch - prog->gofs;
1938 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1939 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
1945 reginfo.ganch = strbeg + PTR2UV(data);
1946 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1947 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1949 } else { /* pos() not defined */
1950 reginfo.ganch = strbeg;
1951 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1952 "GPOS: reginfo.ganch = strbeg\n"));
1955 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
1956 /* We have to be careful. If the previous successful match
1957 was from this regex we don't want a subsequent partially
1958 successful match to clobber the old results.
1959 So when we detect this possibility we add a swap buffer
1960 to the re, and switch the buffer each match. If we fail
1961 we switch it back, otherwise we leave it swapped.
1964 /* do we need a save destructor here for eval dies? */
1965 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
1967 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1968 re_scream_pos_data d;
1970 d.scream_olds = &scream_olds;
1971 d.scream_pos = &scream_pos;
1972 s = re_intuit_start(rx, sv, s, strend, flags, &d);
1974 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1975 goto phooey; /* not present */
1981 /* Simplest case: anchored match need be tried only once. */
1982 /* [unless only anchor is BOL and multiline is set] */
1983 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
1984 if (s == startpos && regtry(®info, &startpos))
1986 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
1987 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
1992 dontbother = minlen - 1;
1993 end = HOP3c(strend, -dontbother, strbeg) - 1;
1994 /* for multiline we only have to try after newlines */
1995 if (prog->check_substr || prog->check_utf8) {
1999 if (regtry(®info, &s))
2004 if (prog->extflags & RXf_USE_INTUIT) {
2005 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2016 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2017 if (regtry(®info, &s))
2024 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2026 /* the warning about reginfo.ganch being used without intialization
2027 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2028 and we only enter this block when the same bit is set. */
2029 char *tmp_s = reginfo.ganch - prog->gofs;
2031 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2036 /* Messy cases: unanchored match. */
2037 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2038 /* we have /x+whatever/ */
2039 /* it must be a one character string (XXXX Except UTF?) */
2044 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2045 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2046 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
2051 DEBUG_EXECUTE_r( did_match = 1 );
2052 if (regtry(®info, &s)) goto got_it;
2054 while (s < strend && *s == ch)
2062 DEBUG_EXECUTE_r( did_match = 1 );
2063 if (regtry(®info, &s)) goto got_it;
2065 while (s < strend && *s == ch)
2070 DEBUG_EXECUTE_r(if (!did_match)
2071 PerlIO_printf(Perl_debug_log,
2072 "Did not find anchored character...\n")
2075 else if (prog->anchored_substr != NULL
2076 || prog->anchored_utf8 != NULL
2077 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2078 && prog->float_max_offset < strend - s)) {
2083 char *last1; /* Last position checked before */
2087 if (prog->anchored_substr || prog->anchored_utf8) {
2088 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2089 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2090 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2091 back_max = back_min = prog->anchored_offset;
2093 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2094 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2095 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2096 back_max = prog->float_max_offset;
2097 back_min = prog->float_min_offset;
2101 if (must == &PL_sv_undef)
2102 /* could not downgrade utf8 check substring, so must fail */
2108 last = HOP3c(strend, /* Cannot start after this */
2109 -(I32)(CHR_SVLEN(must)
2110 - (SvTAIL(must) != 0) + back_min), strbeg);
2113 last1 = HOPc(s, -1);
2115 last1 = s - 1; /* bogus */
2117 /* XXXX check_substr already used to find "s", can optimize if
2118 check_substr==must. */
2120 dontbother = end_shift;
2121 strend = HOPc(strend, -dontbother);
2122 while ( (s <= last) &&
2123 ((flags & REXEC_SCREAM)
2124 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2125 end_shift, &scream_pos, 0))
2126 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2127 (unsigned char*)strend, must,
2128 multiline ? FBMrf_MULTILINE : 0))) ) {
2129 /* we may be pointing at the wrong string */
2130 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2131 s = strbeg + (s - SvPVX_const(sv));
2132 DEBUG_EXECUTE_r( did_match = 1 );
2133 if (HOPc(s, -back_max) > last1) {
2134 last1 = HOPc(s, -back_min);
2135 s = HOPc(s, -back_max);
2138 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2140 last1 = HOPc(s, -back_min);
2144 while (s <= last1) {
2145 if (regtry(®info, &s))
2151 while (s <= last1) {
2152 if (regtry(®info, &s))
2158 DEBUG_EXECUTE_r(if (!did_match) {
2159 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2160 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2161 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2162 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2163 ? "anchored" : "floating"),
2164 quoted, RE_SV_TAIL(must));
2168 else if ( (c = progi->regstclass) ) {
2170 const OPCODE op = OP(progi->regstclass);
2171 /* don't bother with what can't match */
2172 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2173 strend = HOPc(strend, -(minlen - 1));
2176 SV * const prop = sv_newmortal();
2177 regprop(prog, prop, c);
2179 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2181 PerlIO_printf(Perl_debug_log,
2182 "Matching stclass %.*s against %s (%d chars)\n",
2183 (int)SvCUR(prop), SvPVX_const(prop),
2184 quoted, (int)(strend - s));
2187 if (find_byclass(prog, c, s, strend, ®info))
2189 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2193 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2198 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2199 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2200 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2202 if (flags & REXEC_SCREAM) {
2203 last = screaminstr(sv, float_real, s - strbeg,
2204 end_shift, &scream_pos, 1); /* last one */
2206 last = scream_olds; /* Only one occurrence. */
2207 /* we may be pointing at the wrong string */
2208 else if (RXp_MATCH_COPIED(prog))
2209 s = strbeg + (s - SvPVX_const(sv));
2213 const char * const little = SvPV_const(float_real, len);
2215 if (SvTAIL(float_real)) {
2216 if (memEQ(strend - len + 1, little, len - 1))
2217 last = strend - len + 1;
2218 else if (!multiline)
2219 last = memEQ(strend - len, little, len)
2220 ? strend - len : NULL;
2226 last = rninstr(s, strend, little, little + len);
2228 last = strend; /* matching "$" */
2233 PerlIO_printf(Perl_debug_log,
2234 "%sCan't trim the tail, match fails (should not happen)%s\n",
2235 PL_colors[4], PL_colors[5]));
2236 goto phooey; /* Should not happen! */
2238 dontbother = strend - last + prog->float_min_offset;
2240 if (minlen && (dontbother < minlen))
2241 dontbother = minlen - 1;
2242 strend -= dontbother; /* this one's always in bytes! */
2243 /* We don't know much -- general case. */
2246 if (regtry(®info, &s))
2255 if (regtry(®info, &s))
2257 } while (s++ < strend);
2266 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2268 if (PL_reg_eval_set)
2269 restore_pos(aTHX_ prog);
2270 if (RXp_PAREN_NAMES(prog))
2271 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2273 /* make sure $`, $&, $', and $digit will work later */
2274 if ( !(flags & REXEC_NOT_FIRST) ) {
2275 RX_MATCH_COPY_FREE(rx);
2276 if (flags & REXEC_COPY_STR) {
2277 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2278 #ifdef PERL_OLD_COPY_ON_WRITE
2280 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2282 PerlIO_printf(Perl_debug_log,
2283 "Copy on write: regexp capture, type %d\n",
2286 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2287 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2288 assert (SvPOKp(prog->saved_copy));
2292 RX_MATCH_COPIED_on(rx);
2293 s = savepvn(strbeg, i);
2299 prog->subbeg = strbeg;
2300 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2307 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2308 PL_colors[4], PL_colors[5]));
2309 if (PL_reg_eval_set)
2310 restore_pos(aTHX_ prog);
2312 /* we failed :-( roll it back */
2313 Safefree(prog->offs);
2322 - regtry - try match at specific point
2324 STATIC I32 /* 0 failure, 1 success */
2325 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2329 REGEXP *const rx = reginfo->prog;
2330 regexp *const prog = (struct regexp *)SvANY(rx);
2331 RXi_GET_DECL(prog,progi);
2332 GET_RE_DEBUG_FLAGS_DECL;
2334 PERL_ARGS_ASSERT_REGTRY;
2336 reginfo->cutpoint=NULL;
2338 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2341 PL_reg_eval_set = RS_init;
2342 DEBUG_EXECUTE_r(DEBUG_s(
2343 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2344 (IV)(PL_stack_sp - PL_stack_base));
2347 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2348 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2350 /* Apparently this is not needed, judging by wantarray. */
2351 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2352 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2355 /* Make $_ available to executed code. */
2356 if (reginfo->sv != DEFSV) {
2358 DEFSV_set(reginfo->sv);
2361 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2362 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2363 /* prepare for quick setting of pos */
2364 #ifdef PERL_OLD_COPY_ON_WRITE
2365 if (SvIsCOW(reginfo->sv))
2366 sv_force_normal_flags(reginfo->sv, 0);
2368 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2369 &PL_vtbl_mglob, NULL, 0);
2373 PL_reg_oldpos = mg->mg_len;
2374 SAVEDESTRUCTOR_X(restore_pos, prog);
2376 if (!PL_reg_curpm) {
2377 Newxz(PL_reg_curpm, 1, PMOP);
2380 SV* const repointer = &PL_sv_undef;
2381 /* this regexp is also owned by the new PL_reg_curpm, which
2382 will try to free it. */
2383 av_push(PL_regex_padav, repointer);
2384 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2385 PL_regex_pad = AvARRAY(PL_regex_padav);
2390 /* It seems that non-ithreads works both with and without this code.
2391 So for efficiency reasons it seems best not to have the code
2392 compiled when it is not needed. */
2393 /* This is safe against NULLs: */
2394 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2395 /* PM_reg_curpm owns a reference to this regexp. */
2398 PM_SETRE(PL_reg_curpm, rx);
2399 PL_reg_oldcurpm = PL_curpm;
2400 PL_curpm = PL_reg_curpm;
2401 if (RXp_MATCH_COPIED(prog)) {
2402 /* Here is a serious problem: we cannot rewrite subbeg,
2403 since it may be needed if this match fails. Thus
2404 $` inside (?{}) could fail... */
2405 PL_reg_oldsaved = prog->subbeg;
2406 PL_reg_oldsavedlen = prog->sublen;
2407 #ifdef PERL_OLD_COPY_ON_WRITE
2408 PL_nrs = prog->saved_copy;
2410 RXp_MATCH_COPIED_off(prog);
2413 PL_reg_oldsaved = NULL;
2414 prog->subbeg = PL_bostr;
2415 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2417 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2418 prog->offs[0].start = *startpos - PL_bostr;
2419 PL_reginput = *startpos;
2420 PL_reglastparen = &prog->lastparen;
2421 PL_reglastcloseparen = &prog->lastcloseparen;
2422 prog->lastparen = 0;
2423 prog->lastcloseparen = 0;
2425 PL_regoffs = prog->offs;
2426 if (PL_reg_start_tmpl <= prog->nparens) {
2427 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2428 if(PL_reg_start_tmp)
2429 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2431 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2434 /* XXXX What this code is doing here?!!! There should be no need
2435 to do this again and again, PL_reglastparen should take care of
2438 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2439 * Actually, the code in regcppop() (which Ilya may be meaning by
2440 * PL_reglastparen), is not needed at all by the test suite
2441 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2442 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2443 * Meanwhile, this code *is* needed for the
2444 * above-mentioned test suite tests to succeed. The common theme
2445 * on those tests seems to be returning null fields from matches.
2446 * --jhi updated by dapm */
2448 if (prog->nparens) {
2449 regexp_paren_pair *pp = PL_regoffs;
2451 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2459 if (regmatch(reginfo, progi->program + 1)) {
2460 PL_regoffs[0].end = PL_reginput - PL_bostr;
2463 if (reginfo->cutpoint)
2464 *startpos= reginfo->cutpoint;
2465 REGCP_UNWIND(lastcp);
2470 #define sayYES goto yes
2471 #define sayNO goto no
2472 #define sayNO_SILENT goto no_silent
2474 /* we dont use STMT_START/END here because it leads to
2475 "unreachable code" warnings, which are bogus, but distracting. */
2476 #define CACHEsayNO \
2477 if (ST.cache_mask) \
2478 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2481 /* this is used to determine how far from the left messages like
2482 'failed...' are printed. It should be set such that messages
2483 are inline with the regop output that created them.
2485 #define REPORT_CODE_OFF 32
2488 /* Make sure there is a test for this +1 options in re_tests */
2489 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2491 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2492 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2494 #define SLAB_FIRST(s) (&(s)->states[0])
2495 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2497 /* grab a new slab and return the first slot in it */
2499 STATIC regmatch_state *
2502 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2505 regmatch_slab *s = PL_regmatch_slab->next;
2507 Newx(s, 1, regmatch_slab);
2508 s->prev = PL_regmatch_slab;
2510 PL_regmatch_slab->next = s;
2512 PL_regmatch_slab = s;
2513 return SLAB_FIRST(s);
2517 /* push a new state then goto it */
2519 #define PUSH_STATE_GOTO(state, node) \
2521 st->resume_state = state; \
2524 /* push a new state with success backtracking, then goto it */
2526 #define PUSH_YES_STATE_GOTO(state, node) \
2528 st->resume_state = state; \
2529 goto push_yes_state;
2535 regmatch() - main matching routine
2537 This is basically one big switch statement in a loop. We execute an op,
2538 set 'next' to point the next op, and continue. If we come to a point which
2539 we may need to backtrack to on failure such as (A|B|C), we push a
2540 backtrack state onto the backtrack stack. On failure, we pop the top
2541 state, and re-enter the loop at the state indicated. If there are no more
2542 states to pop, we return failure.
2544 Sometimes we also need to backtrack on success; for example /A+/, where
2545 after successfully matching one A, we need to go back and try to
2546 match another one; similarly for lookahead assertions: if the assertion
2547 completes successfully, we backtrack to the state just before the assertion
2548 and then carry on. In these cases, the pushed state is marked as
2549 'backtrack on success too'. This marking is in fact done by a chain of
2550 pointers, each pointing to the previous 'yes' state. On success, we pop to
2551 the nearest yes state, discarding any intermediate failure-only states.
2552 Sometimes a yes state is pushed just to force some cleanup code to be
2553 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2554 it to free the inner regex.
2556 Note that failure backtracking rewinds the cursor position, while
2557 success backtracking leaves it alone.
2559 A pattern is complete when the END op is executed, while a subpattern
2560 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2561 ops trigger the "pop to last yes state if any, otherwise return true"
2564 A common convention in this function is to use A and B to refer to the two
2565 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2566 the subpattern to be matched possibly multiple times, while B is the entire
2567 rest of the pattern. Variable and state names reflect this convention.
2569 The states in the main switch are the union of ops and failure/success of
2570 substates associated with with that op. For example, IFMATCH is the op
2571 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2572 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2573 successfully matched A and IFMATCH_A_fail is a state saying that we have
2574 just failed to match A. Resume states always come in pairs. The backtrack
2575 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2576 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2577 on success or failure.
2579 The struct that holds a backtracking state is actually a big union, with
2580 one variant for each major type of op. The variable st points to the
2581 top-most backtrack struct. To make the code clearer, within each
2582 block of code we #define ST to alias the relevant union.
2584 Here's a concrete example of a (vastly oversimplified) IFMATCH
2590 #define ST st->u.ifmatch
2592 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2593 ST.foo = ...; // some state we wish to save
2595 // push a yes backtrack state with a resume value of
2596 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2598 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2601 case IFMATCH_A: // we have successfully executed A; now continue with B
2603 bar = ST.foo; // do something with the preserved value
2606 case IFMATCH_A_fail: // A failed, so the assertion failed
2607 ...; // do some housekeeping, then ...
2608 sayNO; // propagate the failure
2615 For any old-timers reading this who are familiar with the old recursive
2616 approach, the code above is equivalent to:
2618 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2627 ...; // do some housekeeping, then ...
2628 sayNO; // propagate the failure
2631 The topmost backtrack state, pointed to by st, is usually free. If you
2632 want to claim it, populate any ST.foo fields in it with values you wish to
2633 save, then do one of
2635 PUSH_STATE_GOTO(resume_state, node);
2636 PUSH_YES_STATE_GOTO(resume_state, node);
2638 which sets that backtrack state's resume value to 'resume_state', pushes a
2639 new free entry to the top of the backtrack stack, then goes to 'node'.
2640 On backtracking, the free slot is popped, and the saved state becomes the
2641 new free state. An ST.foo field in this new top state can be temporarily
2642 accessed to retrieve values, but once the main loop is re-entered, it
2643 becomes available for reuse.
2645 Note that the depth of the backtrack stack constantly increases during the
2646 left-to-right execution of the pattern, rather than going up and down with
2647 the pattern nesting. For example the stack is at its maximum at Z at the
2648 end of the pattern, rather than at X in the following:
2650 /(((X)+)+)+....(Y)+....Z/
2652 The only exceptions to this are lookahead/behind assertions and the cut,
2653 (?>A), which pop all the backtrack states associated with A before
2656 Bascktrack state structs are allocated in slabs of about 4K in size.
2657 PL_regmatch_state and st always point to the currently active state,
2658 and PL_regmatch_slab points to the slab currently containing
2659 PL_regmatch_state. The first time regmatch() is called, the first slab is
2660 allocated, and is never freed until interpreter destruction. When the slab
2661 is full, a new one is allocated and chained to the end. At exit from
2662 regmatch(), slabs allocated since entry are freed.
2667 #define DEBUG_STATE_pp(pp) \
2669 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2670 PerlIO_printf(Perl_debug_log, \
2671 " %*s"pp" %s%s%s%s%s\n", \
2673 PL_reg_name[st->resume_state], \
2674 ((st==yes_state||st==mark_state) ? "[" : ""), \
2675 ((st==yes_state) ? "Y" : ""), \
2676 ((st==mark_state) ? "M" : ""), \
2677 ((st==yes_state||st==mark_state) ? "]" : "") \
2682 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2687 S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
2688 const char *start, const char *end, const char *blurb)
2690 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2692 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2697 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2698 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2700 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2701 start, end - start, 60);
2703 PerlIO_printf(Perl_debug_log,
2704 "%s%s REx%s %s against %s\n",
2705 PL_colors[4], blurb, PL_colors[5], s0, s1);
2707 if (do_utf8||utf8_pat)
2708 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2709 utf8_pat ? "pattern" : "",
2710 utf8_pat && do_utf8 ? " and " : "",
2711 do_utf8 ? "string" : ""
2717 S_dump_exec_pos(pTHX_ const char *locinput,
2718 const regnode *scan,
2719 const char *loc_regeol,
2720 const char *loc_bostr,
2721 const char *loc_reg_starttry,
2724 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2725 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2726 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2727 /* The part of the string before starttry has one color
2728 (pref0_len chars), between starttry and current
2729 position another one (pref_len - pref0_len chars),
2730 after the current position the third one.
2731 We assume that pref0_len <= pref_len, otherwise we
2732 decrease pref0_len. */
2733 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2734 ? (5 + taill) - l : locinput - loc_bostr;
2737 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2739 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2741 pref0_len = pref_len - (locinput - loc_reg_starttry);
2742 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2743 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2744 ? (5 + taill) - pref_len : loc_regeol - locinput);
2745 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2749 if (pref0_len > pref_len)
2750 pref0_len = pref_len;
2752 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2754 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2755 (locinput - pref_len),pref0_len, 60, 4, 5);
2757 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2758 (locinput - pref_len + pref0_len),
2759 pref_len - pref0_len, 60, 2, 3);
2761 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2762 locinput, loc_regeol - locinput, 10, 0, 1);
2764 const STRLEN tlen=len0+len1+len2;
2765 PerlIO_printf(Perl_debug_log,
2766 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2767 (IV)(locinput - loc_bostr),
2770 (docolor ? "" : "> <"),
2772 (int)(tlen > 19 ? 0 : 19 - tlen),
2779 /* reg_check_named_buff_matched()
2780 * Checks to see if a named buffer has matched. The data array of
2781 * buffer numbers corresponding to the buffer is expected to reside
2782 * in the regexp->data->data array in the slot stored in the ARG() of
2783 * node involved. Note that this routine doesn't actually care about the
2784 * name, that information is not preserved from compilation to execution.
2785 * Returns the index of the leftmost defined buffer with the given name
2786 * or 0 if non of the buffers matched.
2789 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2792 RXi_GET_DECL(rex,rexi);
2793 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2794 I32 *nums=(I32*)SvPVX(sv_dat);
2796 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2798 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2799 if ((I32)*PL_reglastparen >= nums[n] &&
2800 PL_regoffs[nums[n]].end != -1)
2809 /* free all slabs above current one - called during LEAVE_SCOPE */
2812 S_clear_backtrack_stack(pTHX_ void *p)
2814 regmatch_slab *s = PL_regmatch_slab->next;
2819 PL_regmatch_slab->next = NULL;
2821 regmatch_slab * const osl = s;
2828 #define SETREX(Re1,Re2) \
2829 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2832 STATIC I32 /* 0 failure, 1 success */
2833 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2835 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2839 register const bool do_utf8 = PL_reg_match_utf8;
2840 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2841 REGEXP *rex_sv = reginfo->prog;
2842 regexp *rex = (struct regexp *)SvANY(rex_sv);
2843 RXi_GET_DECL(rex,rexi);
2845 /* the current state. This is a cached copy of PL_regmatch_state */
2846 register regmatch_state *st;
2847 /* cache heavy used fields of st in registers */
2848 register regnode *scan;
2849 register regnode *next;
2850 register U32 n = 0; /* general value; init to avoid compiler warning */
2851 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2852 register char *locinput = PL_reginput;
2853 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2855 bool result = 0; /* return value of S_regmatch */
2856 int depth = 0; /* depth of backtrack stack */
2857 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2858 const U32 max_nochange_depth =
2859 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2860 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2861 regmatch_state *yes_state = NULL; /* state to pop to on success of
2863 /* mark_state piggy backs on the yes_state logic so that when we unwind
2864 the stack on success we can update the mark_state as we go */
2865 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2866 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2867 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2869 bool no_final = 0; /* prevent failure from backtracking? */
2870 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2871 char *startpoint = PL_reginput;
2872 SV *popmark = NULL; /* are we looking for a mark? */
2873 SV *sv_commit = NULL; /* last mark name seen in failure */
2874 SV *sv_yes_mark = NULL; /* last mark name we have seen
2875 during a successfull match */
2876 U32 lastopen = 0; /* last open we saw */
2877 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2878 SV* const oreplsv = GvSV(PL_replgv);
2879 /* these three flags are set by various ops to signal information to
2880 * the very next op. They have a useful lifetime of exactly one loop
2881 * iteration, and are not preserved or restored by state pushes/pops
2883 bool sw = 0; /* the condition value in (?(cond)a|b) */
2884 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2885 int logical = 0; /* the following EVAL is:
2889 or the following IFMATCH/UNLESSM is:
2890 false: plain (?=foo)
2891 true: used as a condition: (?(?=foo))
2894 GET_RE_DEBUG_FLAGS_DECL;
2897 PERL_ARGS_ASSERT_REGMATCH;
2899 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2900 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2902 /* on first ever call to regmatch, allocate first slab */
2903 if (!PL_regmatch_slab) {
2904 Newx(PL_regmatch_slab, 1, regmatch_slab);
2905 PL_regmatch_slab->prev = NULL;
2906 PL_regmatch_slab->next = NULL;
2907 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2910 oldsave = PL_savestack_ix;
2911 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2912 SAVEVPTR(PL_regmatch_slab);
2913 SAVEVPTR(PL_regmatch_state);
2915 /* grab next free state slot */
2916 st = ++PL_regmatch_state;
2917 if (st > SLAB_LAST(PL_regmatch_slab))
2918 st = PL_regmatch_state = S_push_slab(aTHX);
2920 /* Note that nextchr is a byte even in UTF */
2921 nextchr = UCHARAT(locinput);
2923 while (scan != NULL) {
2926 SV * const prop = sv_newmortal();
2927 regnode *rnext=regnext(scan);
2928 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2929 regprop(rex, prop, scan);
2931 PerlIO_printf(Perl_debug_log,
2932 "%3"IVdf":%*s%s(%"IVdf")\n",
2933 (IV)(scan - rexi->program), depth*2, "",
2935 (PL_regkind[OP(scan)] == END || !rnext) ?
2936 0 : (IV)(rnext - rexi->program));
2939 next = scan + NEXT_OFF(scan);
2942 state_num = OP(scan);
2946 assert(PL_reglastparen == &rex->lastparen);
2947 assert(PL_reglastcloseparen == &rex->lastcloseparen);
2948 assert(PL_regoffs == rex->offs);
2950 switch (state_num) {
2952 if (locinput == PL_bostr)
2954 /* reginfo->till = reginfo->bol; */
2959 if (locinput == PL_bostr ||
2960 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
2966 if (locinput == PL_bostr)
2970 if (locinput == reginfo->ganch)
2975 /* update the startpoint */
2976 st->u.keeper.val = PL_regoffs[0].start;
2977 PL_reginput = locinput;
2978 PL_regoffs[0].start = locinput - PL_bostr;
2979 PUSH_STATE_GOTO(KEEPS_next, next);
2981 case KEEPS_next_fail:
2982 /* rollback the start point change */
2983 PL_regoffs[0].start = st->u.keeper.val;
2989 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2994 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
2996 if (PL_regeol - locinput > 1)
3000 if (PL_regeol != locinput)
3004 if (!nextchr && locinput >= PL_regeol)
3007 locinput += PL_utf8skip[nextchr];
3008 if (locinput > PL_regeol)
3010 nextchr = UCHARAT(locinput);
3013 nextchr = UCHARAT(++locinput);
3016 if (!nextchr && locinput >= PL_regeol)
3018 nextchr = UCHARAT(++locinput);
3021 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3024 locinput += PL_utf8skip[nextchr];
3025 if (locinput > PL_regeol)
3027 nextchr = UCHARAT(locinput);
3030 nextchr = UCHARAT(++locinput);
3034 #define ST st->u.trie
3036 /* In this case the charclass data is available inline so
3037 we can fail fast without a lot of extra overhead.
3039 if (scan->flags == EXACT || !do_utf8) {
3040 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3042 PerlIO_printf(Perl_debug_log,
3043 "%*s %sfailed to match trie start class...%s\n",
3044 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3053 /* what type of TRIE am I? (utf8 makes this contextual) */
3054 DECL_TRIE_TYPE(scan);
3056 /* what trie are we using right now */
3057 reg_trie_data * const trie
3058 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3059 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3060 U32 state = trie->startstate;
3062 if (trie->bitmap && trie_type != trie_utf8_fold &&
3063 !TRIE_BITMAP_TEST(trie,*locinput)
3065 if (trie->states[ state ].wordnum) {
3067 PerlIO_printf(Perl_debug_log,
3068 "%*s %smatched empty string...%s\n",
3069 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3074 PerlIO_printf(Perl_debug_log,
3075 "%*s %sfailed to match trie start class...%s\n",
3076 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3083 U8 *uc = ( U8* )locinput;
3087 U8 *uscan = (U8*)NULL;
3089 SV *sv_accept_buff = NULL;
3090 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3092 ST.accepted = 0; /* how many accepting states we have seen */
3094 ST.jump = trie->jump;
3097 traverse the TRIE keeping track of all accepting states
3098 we transition through until we get to a failing node.
3101 while ( state && uc <= (U8*)PL_regeol ) {
3102 U32 base = trie->states[ state ].trans.base;
3105 /* We use charid to hold the wordnum as we don't use it
3106 for charid until after we have done the wordnum logic.
3107 We define an alias just so that the wordnum logic reads
3110 #define got_wordnum charid
3111 got_wordnum = trie->states[ state ].wordnum;
3113 if ( got_wordnum ) {
3114 if ( ! ST.accepted ) {
3116 SAVETMPS; /* XXX is this necessary? dmq */
3117 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3118 sv_accept_buff=newSV(bufflen *
3119 sizeof(reg_trie_accepted) - 1);
3120 SvCUR_set(sv_accept_buff, 0);
3121 SvPOK_on(sv_accept_buff);
3122 sv_2mortal(sv_accept_buff);
3125 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3128 if (ST.accepted >= bufflen) {
3130 ST.accept_buff =(reg_trie_accepted*)
3131 SvGROW(sv_accept_buff,
3132 bufflen * sizeof(reg_trie_accepted));
3134 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3135 + sizeof(reg_trie_accepted));
3138 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3139 ST.accept_buff[ST.accepted].endpos = uc;
3141 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3145 DEBUG_TRIE_EXECUTE_r({
3146 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
3147 PerlIO_printf( Perl_debug_log,
3148 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
3149 2+depth * 2, "", PL_colors[4],
3150 (UV)state, (UV)ST.accepted );
3154 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3155 uscan, len, uvc, charid, foldlen,
3159 (base + charid > trie->uniquecharcount )
3160 && (base + charid - 1 - trie->uniquecharcount
3162 && trie->trans[base + charid - 1 -
3163 trie->uniquecharcount].check == state)
3165 state = trie->trans[base + charid - 1 -
3166 trie->uniquecharcount ].next;
3177 DEBUG_TRIE_EXECUTE_r(
3178 PerlIO_printf( Perl_debug_log,
3179 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3180 charid, uvc, (UV)state, PL_colors[5] );
3187 PerlIO_printf( Perl_debug_log,
3188 "%*s %sgot %"IVdf" possible matches%s\n",
3189 REPORT_CODE_OFF + depth * 2, "",
3190 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3193 goto trie_first_try; /* jump into the fail handler */
3195 case TRIE_next_fail: /* we failed - try next alterative */
3197 REGCP_UNWIND(ST.cp);
3198 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3199 PL_regoffs[n].end = -1;
3200 *PL_reglastparen = n;
3209 ST.lastparen = *PL_reglastparen;
3212 if ( ST.accepted == 1 ) {
3213 /* only one choice left - just continue */
3215 AV *const trie_words
3216 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3217 SV ** const tmp = av_fetch( trie_words,
3218 ST.accept_buff[ 0 ].wordnum-1, 0 );
3219 SV *sv= tmp ? sv_newmortal() : NULL;
3221 PerlIO_printf( Perl_debug_log,
3222 "%*s %sonly one match left: #%d <%s>%s\n",
3223 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3224 ST.accept_buff[ 0 ].wordnum,
3225 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3226 PL_colors[0], PL_colors[1],
3227 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3229 : "not compiled under -Dr",
3232 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3233 /* in this case we free tmps/leave before we call regmatch
3234 as we wont be using accept_buff again. */
3236 locinput = PL_reginput;
3237 nextchr = UCHARAT(locinput);
3238 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3241 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3242 if (!has_cutgroup) {
3247 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3250 continue; /* execute rest of RE */
3253 if ( !ST.accepted-- ) {
3255 PerlIO_printf( Perl_debug_log,
3256 "%*s %sTRIE failed...%s\n",
3257 REPORT_CODE_OFF+depth*2, "",
3268 There are at least two accepting states left. Presumably
3269 the number of accepting states is going to be low,
3270 typically two. So we simply scan through to find the one
3271 with lowest wordnum. Once we find it, we swap the last
3272 state into its place and decrement the size. We then try to
3273 match the rest of the pattern at the point where the word
3274 ends. If we succeed, control just continues along the
3275 regex; if we fail we return here to try the next accepting
3282 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3283 DEBUG_TRIE_EXECUTE_r(
3284 PerlIO_printf( Perl_debug_log,
3285 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3286 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3287 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3288 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3291 if (ST.accept_buff[cur].wordnum <
3292 ST.accept_buff[best].wordnum)
3297 AV *const trie_words
3298 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3299 SV ** const tmp = av_fetch( trie_words,
3300 ST.accept_buff[ best ].wordnum - 1, 0 );
3301 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3303 ST.me + ST.jump[ST.accept_buff[best].wordnum];
3304 SV *sv= tmp ? sv_newmortal() : NULL;
3306 PerlIO_printf( Perl_debug_log,
3307 "%*s %strying alternation #%d <%s> at node #%d %s\n",
3308 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3309 ST.accept_buff[best].wordnum,
3310 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3311 PL_colors[0], PL_colors[1],
3312 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3313 ) : "not compiled under -Dr",
3314 REG_NODE_NUM(nextop),
3318 if ( best<ST.accepted ) {
3319 reg_trie_accepted tmp = ST.accept_buff[ best ];
3320 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3321 ST.accept_buff[ ST.accepted ] = tmp;
3324 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3325 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3328 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3330 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3335 /* we dont want to throw this away, see bug 57042*/
3336 if (oreplsv != GvSV(PL_replgv))
3337 sv_setsv(oreplsv, GvSV(PL_replgv));
3344 char *s = STRING(scan);
3346 if (do_utf8 != UTF) {
3347 /* The target and the pattern have differing utf8ness. */
3349 const char * const e = s + ln;
3352 /* The target is utf8, the pattern is not utf8. */
3357 if (NATIVE_TO_UNI(*(U8*)s) !=
3358 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3366 /* The target is not utf8, the pattern is utf8. */
3371 if (NATIVE_TO_UNI(*((U8*)l)) !=
3372 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3380 nextchr = UCHARAT(locinput);
3383 /* The target and the pattern have the same utf8ness. */
3384 /* Inline the first character, for speed. */
3385 if (UCHARAT(s) != nextchr)
3387 if (PL_regeol - locinput < ln)
3389 if (ln > 1 && memNE(s, locinput, ln))
3392 nextchr = UCHARAT(locinput);
3396 PL_reg_flags |= RF_tainted;
3399 char * const s = STRING(scan);
3402 if (do_utf8 || UTF) {
3403 /* Either target or the pattern are utf8. */
3404 const char * const l = locinput;
3405 char *e = PL_regeol;
3407 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3408 l, &e, 0, do_utf8)) {
3409 /* One more case for the sharp s:
3410 * pack("U0U*", 0xDF) =~ /ss/i,
3411 * the 0xC3 0x9F are the UTF-8
3412 * byte sequence for the U+00DF. */
3415 toLOWER(s[0]) == 's' &&
3417 toLOWER(s[1]) == 's' &&
3424 nextchr = UCHARAT(locinput);
3428 /* Neither the target and the pattern are utf8. */
3430 /* Inline the first character, for speed. */
3431 if (UCHARAT(s) != nextchr &&
3432 UCHARAT(s) != ((OP(scan) == EXACTF)
3433 ? PL_fold : PL_fold_locale)[nextchr])
3435 if (PL_regeol - locinput < ln)
3437 if (ln > 1 && (OP(scan) == EXACTF
3438 ? ibcmp(s, locinput, ln)
3439 : ibcmp_locale(s, locinput, ln)))
3442 nextchr = UCHARAT(locinput);
3447 PL_reg_flags |= RF_tainted;
3451 /* was last char in word? */
3453 if (locinput == PL_bostr)
3456 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3458 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3460 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3461 ln = isALNUM_uni(ln);
3462 LOAD_UTF8_CHARCLASS_ALNUM();
3463 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3466 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3467 n = isALNUM_LC_utf8((U8*)locinput);
3471 ln = (locinput != PL_bostr) ?
3472 UCHARAT(locinput - 1) : '\n';
3473 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3475 n = isALNUM(nextchr);
3478 ln = isALNUM_LC(ln);
3479 n = isALNUM_LC(nextchr);
3482 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3483 OP(scan) == BOUNDL))
3488 STRLEN inclasslen = PL_regeol - locinput;
3490 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3492 if (locinput >= PL_regeol)
3494 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3495 nextchr = UCHARAT(locinput);
3500 nextchr = UCHARAT(locinput);
3501 if (!REGINCLASS(rex, scan, (U8*)locinput))
3503 if (!nextchr && locinput >= PL_regeol)
3505 nextchr = UCHARAT(++locinput);
3509 /* If we might have the case of the German sharp s
3510 * in a casefolding Unicode character class. */
3512 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3513 locinput += SHARP_S_SKIP;
3514 nextchr = UCHARAT(locinput);
3519 /* Special char classes - The defines start on line 129 or so */
3520 CCC_TRY_AFF( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3521 CCC_TRY_NEG(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3523 CCC_TRY_AFF( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3524 CCC_TRY_NEG(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3526 CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3527 CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3530 if (locinput >= PL_regeol)
3533 LOAD_UTF8_CHARCLASS_MARK();
3534 if (swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3536 locinput += PL_utf8skip[nextchr];
3537 while (locinput < PL_regeol &&
3538 swash_fetch(PL_utf8_mark,(U8*)locinput, do_utf8))
3539 locinput += UTF8SKIP(locinput);
3540 if (locinput > PL_regeol)
3545 nextchr = UCHARAT(locinput);
3552 PL_reg_flags |= RF_tainted;
3557 n = reg_check_named_buff_matched(rex,scan);
3560 type = REF + ( type - NREF );
3567 PL_reg_flags |= RF_tainted;
3571 n = ARG(scan); /* which paren pair */
3574 ln = PL_regoffs[n].start;
3575 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3576 if (*PL_reglastparen < n || ln == -1)
3577 sayNO; /* Do not match unless seen CLOSEn. */
3578 if (ln == PL_regoffs[n].end)
3582 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3584 const char *e = PL_bostr + PL_regoffs[n].end;
3586 * Note that we can't do the "other character" lookup trick as
3587 * in the 8-bit case (no pun intended) because in Unicode we
3588 * have to map both upper and title case to lower case.
3592 STRLEN ulen1, ulen2;
3593 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3594 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3598 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3599 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3600 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3607 nextchr = UCHARAT(locinput);
3611 /* Inline the first character, for speed. */
3612 if (UCHARAT(s) != nextchr &&
3614 (UCHARAT(s) != (type == REFF
3615 ? PL_fold : PL_fold_locale)[nextchr])))
3617 ln = PL_regoffs[n].end - ln;
3618 if (locinput + ln > PL_regeol)
3620 if (ln > 1 && (type == REF
3621 ? memNE(s, locinput, ln)
3623 ? ibcmp(s, locinput, ln)
3624 : ibcmp_locale(s, locinput, ln))))
3627 nextchr = UCHARAT(locinput);
3637 #define ST st->u.eval
3642 regexp_internal *rei;
3643 regnode *startpoint;
3646 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3647 if (cur_eval && cur_eval->locinput==locinput) {
3648 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3649 Perl_croak(aTHX_ "Infinite recursion in regex");
3650 if ( ++nochange_depth > max_nochange_depth )
3652 "Pattern subroutine nesting without pos change"
3653 " exceeded limit in regex");
3660 (void)ReREFCNT_inc(rex_sv);
3661 if (OP(scan)==GOSUB) {
3662 startpoint = scan + ARG2L(scan);
3663 ST.close_paren = ARG(scan);
3665 startpoint = rei->program+1;
3668 goto eval_recurse_doit;
3670 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3671 if (cur_eval && cur_eval->locinput==locinput) {
3672 if ( ++nochange_depth > max_nochange_depth )
3673 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3678 /* execute the code in the {...} */
3680 SV ** const before = SP;
3681 OP_4tree * const oop = PL_op;
3682 COP * const ocurcop = PL_curcop;
3684 char *saved_regeol = PL_regeol;
3687 PL_op = (OP_4tree*)rexi->data->data[n];
3688 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3689 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3690 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3691 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3694 SV *sv_mrk = get_sv("REGMARK", 1);
3695 sv_setsv(sv_mrk, sv_yes_mark);
3698 CALLRUNOPS(aTHX); /* Scalar context. */
3701 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3708 PAD_RESTORE_LOCAL(old_comppad);
3709 PL_curcop = ocurcop;
3710 PL_regeol = saved_regeol;
3713 sv_setsv(save_scalar(PL_replgv), ret);
3717 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3720 /* extract RE object from returned value; compiling if
3726 SV *const sv = SvRV(ret);
3728 if (SvTYPE(sv) == SVt_REGEXP) {
3730 } else if (SvSMAGICAL(sv)) {
3731 mg = mg_find(sv, PERL_MAGIC_qr);
3734 } else if (SvTYPE(ret) == SVt_REGEXP) {
3736 } else if (SvSMAGICAL(ret)) {
3737 if (SvGMAGICAL(ret)) {
3738 /* I don't believe that there is ever qr magic
3740 assert(!mg_find(ret, PERL_MAGIC_qr));
3741 sv_unmagic(ret, PERL_MAGIC_qr);
3744 mg = mg_find(ret, PERL_MAGIC_qr);
3745 /* testing suggests mg only ends up non-NULL for
3746 scalars who were upgraded and compiled in the
3747 else block below. In turn, this is only
3748 triggered in the "postponed utf8 string" tests
3754 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
3758 rx = reg_temp_copy(rx);
3762 const I32 osize = PL_regsize;
3765 assert (SvUTF8(ret));
3766 } else if (SvUTF8(ret)) {
3767 /* Not doing UTF-8, despite what the SV says. Is
3768 this only if we're trapped in use 'bytes'? */
3769 /* Make a copy of the octet sequence, but without
3770 the flag on, as the compiler now honours the
3771 SvUTF8 flag on ret. */
3773 const char *const p = SvPV(ret, len);
3774 ret = newSVpvn_flags(p, len, SVs_TEMP);
3776 rx = CALLREGCOMP(ret, pm_flags);
3778 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
3780 /* This isn't a first class regexp. Instead, it's
3781 caching a regexp onto an existing, Perl visible
3783 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
3788 re = (struct regexp *)SvANY(rx);
3790 RXp_MATCH_COPIED_off(re);
3791 re->subbeg = rex->subbeg;
3792 re->sublen = rex->sublen;
3795 debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
3796 "Matching embedded");
3798 startpoint = rei->program + 1;
3799 ST.close_paren = 0; /* only used for GOSUB */
3800 /* borrowed from regtry */
3801 if (PL_reg_start_tmpl <= re->nparens) {
3802 PL_reg_start_tmpl = re->nparens*3/2 + 3;
3803 if(PL_reg_start_tmp)
3804 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3806 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
3809 eval_recurse_doit: /* Share code with GOSUB below this line */
3810 /* run the pattern returned from (??{...}) */
3811 ST.cp = regcppush(0); /* Save *all* the positions. */
3812 REGCP_SET(ST.lastcp);
3814 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
3816 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
3817 PL_reglastparen = &re->lastparen;
3818 PL_reglastcloseparen = &re->lastcloseparen;
3820 re->lastcloseparen = 0;
3822 PL_reginput = locinput;
3825 /* XXXX This is too dramatic a measure... */
3828 ST.toggle_reg_flags = PL_reg_flags;
3830 PL_reg_flags |= RF_utf8;
3832 PL_reg_flags &= ~RF_utf8;
3833 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
3835 ST.prev_rex = rex_sv;
3836 ST.prev_curlyx = cur_curlyx;
3837 SETREX(rex_sv,re_sv);
3842 ST.prev_eval = cur_eval;
3844 /* now continue from first node in postoned RE */
3845 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
3848 /* logical is 1, /(?(?{...})X|Y)/ */
3849 sw = (bool)SvTRUE(ret);
3854 case EVAL_AB: /* cleanup after a successful (??{A})B */
3855 /* note: this is called twice; first after popping B, then A */
3856 PL_reg_flags ^= ST.toggle_reg_flags;
3857 ReREFCNT_dec(rex_sv);
3858 SETREX(rex_sv,ST.prev_rex);
3859 rex = (struct regexp *)SvANY(rex_sv);
3860 rexi = RXi_GET(rex);
3862 cur_eval = ST.prev_eval;
3863 cur_curlyx = ST.prev_curlyx;
3865 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
3866 PL_reglastparen = &rex->lastparen;
3867 PL_reglastcloseparen = &rex->lastcloseparen;
3868 /* also update PL_regoffs */
3869 PL_regoffs = rex->offs;
3871 /* XXXX This is too dramatic a measure... */
3873 if ( nochange_depth )
3878 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
3879 /* note: this is called twice; first after popping B, then A */
3880 PL_reg_flags ^= ST.toggle_reg_flags;
3881 ReREFCNT_dec(rex_sv);
3882 SETREX(rex_sv,ST.prev_rex);
3883 rex = (struct regexp *)SvANY(rex_sv);
3884 rexi = RXi_GET(rex);
3885 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
3886 PL_reglastparen = &rex->lastparen;
3887 PL_reglastcloseparen = &rex->lastcloseparen;
3889 PL_reginput = locinput;
3890 REGCP_UNWIND(ST.lastcp);
3892 cur_eval = ST.prev_eval;
3893 cur_curlyx = ST.prev_curlyx;
3894 /* XXXX This is too dramatic a measure... */
3896 if ( nochange_depth )
3902 n = ARG(scan); /* which paren pair */
3903 PL_reg_start_tmp[n] = locinput;
3909 n = ARG(scan); /* which paren pair */
3910 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
3911 PL_regoffs[n].end = locinput - PL_bostr;
3912 /*if (n > PL_regsize)
3914 if (n > *PL_reglastparen)
3915 *PL_reglastparen = n;
3916 *PL_reglastcloseparen = n;
3917 if (cur_eval && cur_eval->u.eval.close_paren == n) {
3925 cursor && OP(cursor)!=END;
3926 cursor=regnext(cursor))
3928 if ( OP(cursor)==CLOSE ){
3930 if ( n <= lastopen ) {
3932 = PL_reg_start_tmp[n] - PL_bostr;
3933 PL_regoffs[n].end = locinput - PL_bostr;
3934 /*if (n > PL_regsize)
3936 if (n > *PL_reglastparen)
3937 *PL_reglastparen = n;
3938 *PL_reglastcloseparen = n;
3939 if ( n == ARG(scan) || (cur_eval &&
3940 cur_eval->u.eval.close_paren == n))
3949 n = ARG(scan); /* which paren pair */
3950 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
3953 /* reg_check_named_buff_matched returns 0 for no match */
3954 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
3958 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
3964 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3966 next = NEXTOPER(NEXTOPER(scan));
3968 next = scan + ARG(scan);
3969 if (OP(next) == IFTHEN) /* Fake one. */
3970 next = NEXTOPER(NEXTOPER(next));
3974 logical = scan->flags;
3977 /*******************************************************************
3979 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
3980 pattern, where A and B are subpatterns. (For simple A, CURLYM or
3981 STAR/PLUS/CURLY/CURLYN are used instead.)
3983 A*B is compiled as <CURLYX><A><WHILEM><B>
3985 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
3986 state, which contains the current count, initialised to -1. It also sets
3987 cur_curlyx to point to this state, with any previous value saved in the
3990 CURLYX then jumps straight to the WHILEM op, rather than executing A,
3991 since the pattern may possibly match zero times (i.e. it's a while {} loop
3992 rather than a do {} while loop).
3994 Each entry to WHILEM represents a successful match of A. The count in the
3995 CURLYX block is incremented, another WHILEM state is pushed, and execution
3996 passes to A or B depending on greediness and the current count.
3998 For example, if matching against the string a1a2a3b (where the aN are
3999 substrings that match /A/), then the match progresses as follows: (the
4000 pushed states are interspersed with the bits of strings matched so far):
4003 <CURLYX cnt=0><WHILEM>
4004 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4005 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4006 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4007 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4009 (Contrast this with something like CURLYM, which maintains only a single
4013 a1 <CURLYM cnt=1> a2
4014 a1 a2 <CURLYM cnt=2> a3
4015 a1 a2 a3 <CURLYM cnt=3> b
4018 Each WHILEM state block marks a point to backtrack to upon partial failure
4019 of A or B, and also contains some minor state data related to that
4020 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4021 overall state, such as the count, and pointers to the A and B ops.
4023 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4024 must always point to the *current* CURLYX block, the rules are:
4026 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4027 and set cur_curlyx to point the new block.
4029 When popping the CURLYX block after a successful or unsuccessful match,
4030 restore the previous cur_curlyx.
4032 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4033 to the outer one saved in the CURLYX block.
4035 When popping the WHILEM block after a successful or unsuccessful B match,
4036 restore the previous cur_curlyx.
4038 Here's an example for the pattern (AI* BI)*BO
4039 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4042 curlyx backtrack stack
4043 ------ ---------------
4045 CO <CO prev=NULL> <WO>
4046 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4047 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4048 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4050 At this point the pattern succeeds, and we work back down the stack to
4051 clean up, restoring as we go:
4053 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4054 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4055 CO <CO prev=NULL> <WO>
4058 *******************************************************************/
4060 #define ST st->u.curlyx
4062 case CURLYX: /* start of /A*B/ (for complex A) */
4064 /* No need to save/restore up to this paren */
4065 I32 parenfloor = scan->flags;
4067 assert(next); /* keep Coverity happy */
4068 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4071 /* XXXX Probably it is better to teach regpush to support
4072 parenfloor > PL_regsize... */
4073 if (parenfloor > (I32)*PL_reglastparen)
4074 parenfloor = *PL_reglastparen; /* Pessimization... */
4076 ST.prev_curlyx= cur_curlyx;
4078 ST.cp = PL_savestack_ix;
4080 /* these fields contain the state of the current curly.
4081 * they are accessed by subsequent WHILEMs */
4082 ST.parenfloor = parenfloor;
4083 ST.min = ARG1(scan);
4084 ST.max = ARG2(scan);
4085 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4089 ST.count = -1; /* this will be updated by WHILEM */
4090 ST.lastloc = NULL; /* this will be updated by WHILEM */
4092 PL_reginput = locinput;
4093 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4097 case CURLYX_end: /* just finished matching all of A*B */
4098 cur_curlyx = ST.prev_curlyx;
4102 case CURLYX_end_fail: /* just failed to match all of A*B */
4104 cur_curlyx = ST.prev_curlyx;
4110 #define ST st->u.whilem
4112 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4114 /* see the discussion above about CURLYX/WHILEM */
4116 assert(cur_curlyx); /* keep Coverity happy */
4117 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4118 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4119 ST.cache_offset = 0;
4122 PL_reginput = locinput;
4124 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4125 "%*s whilem: matched %ld out of %ld..%ld\n",
4126 REPORT_CODE_OFF+depth*2, "", (long)n,
4127 (long)cur_curlyx->u.curlyx.min,
4128 (long)cur_curlyx->u.curlyx.max)
4131 /* First just match a string of min A's. */
4133 if (n < cur_curlyx->u.curlyx.min) {
4134 cur_curlyx->u.curlyx.lastloc = locinput;
4135 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4139 /* If degenerate A matches "", assume A done. */
4141 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4142 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4143 "%*s whilem: empty match detected, trying continuation...\n",
4144 REPORT_CODE_OFF+depth*2, "")
4146 goto do_whilem_B_max;
4149 /* super-linear cache processing */
4153 if (!PL_reg_maxiter) {
4154 /* start the countdown: Postpone detection until we
4155 * know the match is not *that* much linear. */
4156 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4157 /* possible overflow for long strings and many CURLYX's */
4158 if (PL_reg_maxiter < 0)
4159 PL_reg_maxiter = I32_MAX;
4160 PL_reg_leftiter = PL_reg_maxiter;
4163 if (PL_reg_leftiter-- == 0) {
4164 /* initialise cache */
4165 const I32 size = (PL_reg_maxiter + 7)/8;
4166 if (PL_reg_poscache) {
4167 if ((I32)PL_reg_poscache_size < size) {
4168 Renew(PL_reg_poscache, size, char);
4169 PL_reg_poscache_size = size;
4171 Zero(PL_reg_poscache, size, char);
4174 PL_reg_poscache_size = size;
4175 Newxz(PL_reg_poscache, size, char);
4177 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4178 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4179 PL_colors[4], PL_colors[5])
4183 if (PL_reg_leftiter < 0) {
4184 /* have we already failed at this position? */
4186 offset = (scan->flags & 0xf) - 1
4187 + (locinput - PL_bostr) * (scan->flags>>4);
4188 mask = 1 << (offset % 8);
4190 if (PL_reg_poscache[offset] & mask) {
4191 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4192 "%*s whilem: (cache) already tried at this position...\n",
4193 REPORT_CODE_OFF+depth*2, "")
4195 sayNO; /* cache records failure */
4197 ST.cache_offset = offset;
4198 ST.cache_mask = mask;
4202 /* Prefer B over A for minimal matching. */
4204 if (cur_curlyx->u.curlyx.minmod) {
4205 ST.save_curlyx = cur_curlyx;
4206 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4207 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4208 REGCP_SET(ST.lastcp);
4209 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4213 /* Prefer A over B for maximal matching. */
4215 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4216 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4217 cur_curlyx->u.curlyx.lastloc = locinput;
4218 REGCP_SET(ST.lastcp);
4219 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4222 goto do_whilem_B_max;
4226 case WHILEM_B_min: /* just matched B in a minimal match */
4227 case WHILEM_B_max: /* just matched B in a maximal match */
4228 cur_curlyx = ST.save_curlyx;
4232 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4233 cur_curlyx = ST.save_curlyx;
4234 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4235 cur_curlyx->u.curlyx.count--;
4239 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4240 REGCP_UNWIND(ST.lastcp);
4243 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4244 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4245 cur_curlyx->u.curlyx.count--;
4249 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4250 REGCP_UNWIND(ST.lastcp);
4251 regcppop(rex); /* Restore some previous $<digit>s? */
4252 PL_reginput = locinput;
4253 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4254 "%*s whilem: failed, trying continuation...\n",
4255 REPORT_CODE_OFF+depth*2, "")
4258 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4259 && ckWARN(WARN_REGEXP)
4260 && !(PL_reg_flags & RF_warned))
4262 PL_reg_flags |= RF_warned;
4263 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4264 "Complex regular subexpression recursion",
4269 ST.save_curlyx = cur_curlyx;
4270 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4271 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4274 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4275 cur_curlyx = ST.save_curlyx;
4276 REGCP_UNWIND(ST.lastcp);
4279 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4280 /* Maximum greed exceeded */
4281 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4282 && ckWARN(WARN_REGEXP)
4283 && !(PL_reg_flags & RF_warned))
4285 PL_reg_flags |= RF_warned;
4286 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4287 "%s limit (%d) exceeded",
4288 "Complex regular subexpression recursion",
4291 cur_curlyx->u.curlyx.count--;
4295 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4296 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4298 /* Try grabbing another A and see if it helps. */
4299 PL_reginput = locinput;
4300 cur_curlyx->u.curlyx.lastloc = locinput;
4301 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4302 REGCP_SET(ST.lastcp);
4303 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4307 #define ST st->u.branch
4309 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4310 next = scan + ARG(scan);
4313 scan = NEXTOPER(scan);
4316 case BRANCH: /* /(...|A|...)/ */
4317 scan = NEXTOPER(scan); /* scan now points to inner node */
4318 ST.lastparen = *PL_reglastparen;
4319 ST.next_branch = next;
4321 PL_reginput = locinput;
4323 /* Now go into the branch */
4325 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4327 PUSH_STATE_GOTO(BRANCH_next, scan);
4331 PL_reginput = locinput;
4332 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4333 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4334 PUSH_STATE_GOTO(CUTGROUP_next,next);
4336 case CUTGROUP_next_fail:
4339 if (st->u.mark.mark_name)
4340 sv_commit = st->u.mark.mark_name;
4346 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4351 REGCP_UNWIND(ST.cp);
4352 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4353 PL_regoffs[n].end = -1;
4354 *PL_reglastparen = n;
4355 /*dmq: *PL_reglastcloseparen = n; */
4356 scan = ST.next_branch;
4357 /* no more branches? */
4358 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4360 PerlIO_printf( Perl_debug_log,
4361 "%*s %sBRANCH failed...%s\n",
4362 REPORT_CODE_OFF+depth*2, "",
4368 continue; /* execute next BRANCH[J] op */
4376 #define ST st->u.curlym
4378 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4380 /* This is an optimisation of CURLYX that enables us to push
4381 * only a single backtracking state, no matter how many matches
4382 * there are in {m,n}. It relies on the pattern being constant
4383 * length, with no parens to influence future backrefs
4387 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4389 /* if paren positive, emulate an OPEN/CLOSE around A */
4391 U32 paren = ST.me->flags;
4392 if (paren > PL_regsize)
4394 if (paren > *PL_reglastparen)
4395 *PL_reglastparen = paren;
4396 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4404 ST.c1 = CHRTEST_UNINIT;
4407 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4410 curlym_do_A: /* execute the A in /A{m,n}B/ */
4411 PL_reginput = locinput;
4412 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4415 case CURLYM_A: /* we've just matched an A */
4416 locinput = st->locinput;
4417 nextchr = UCHARAT(locinput);
4420 /* after first match, determine A's length: u.curlym.alen */
4421 if (ST.count == 1) {
4422 if (PL_reg_match_utf8) {
4424 while (s < PL_reginput) {
4430 ST.alen = PL_reginput - locinput;
4433 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4436 PerlIO_printf(Perl_debug_log,
4437 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4438 (int)(REPORT_CODE_OFF+(depth*2)), "",
4439 (IV) ST.count, (IV)ST.alen)
4442 locinput = PL_reginput;
4444 if (cur_eval && cur_eval->u.eval.close_paren &&
4445 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4449 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4450 if ( max == REG_INFTY || ST.count < max )
4451 goto curlym_do_A; /* try to match another A */
4453 goto curlym_do_B; /* try to match B */
4455 case CURLYM_A_fail: /* just failed to match an A */
4456 REGCP_UNWIND(ST.cp);
4458 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4459 || (cur_eval && cur_eval->u.eval.close_paren &&
4460 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4463 curlym_do_B: /* execute the B in /A{m,n}B/ */
4464 PL_reginput = locinput;
4465 if (ST.c1 == CHRTEST_UNINIT) {
4466 /* calculate c1 and c2 for possible match of 1st char
4467 * following curly */
4468 ST.c1 = ST.c2 = CHRTEST_VOID;
4469 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4470 regnode *text_node = ST.B;
4471 if (! HAS_TEXT(text_node))
4472 FIND_NEXT_IMPT(text_node);
4475 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4477 But the former is redundant in light of the latter.
4479 if this changes back then the macro for
4480 IS_TEXT and friends need to change.
4482 if (PL_regkind[OP(text_node)] == EXACT)
4485 ST.c1 = (U8)*STRING(text_node);
4487 (IS_TEXTF(text_node))
4489 : (IS_TEXTFL(text_node))
4490 ? PL_fold_locale[ST.c1]
4497 PerlIO_printf(Perl_debug_log,
4498 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4499 (int)(REPORT_CODE_OFF+(depth*2)),
4502 if (ST.c1 != CHRTEST_VOID
4503 && UCHARAT(PL_reginput) != ST.c1
4504 && UCHARAT(PL_reginput) != ST.c2)
4506 /* simulate B failing */
4508 PerlIO_printf(Perl_debug_log,
4509 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4510 (int)(REPORT_CODE_OFF+(depth*2)),"",
4513 state_num = CURLYM_B_fail;
4514 goto reenter_switch;
4518 /* mark current A as captured */
4519 I32 paren = ST.me->flags;
4521 PL_regoffs[paren].start
4522 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4523 PL_regoffs[paren].end = PL_reginput - PL_bostr;
4524 /*dmq: *PL_reglastcloseparen = paren; */
4527 PL_regoffs[paren].end = -1;
4528 if (cur_eval && cur_eval->u.eval.close_paren &&
4529 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4538 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4541 case CURLYM_B_fail: /* just failed to match a B */
4542 REGCP_UNWIND(ST.cp);
4544 I32 max = ARG2(ST.me);
4545 if (max != REG_INFTY && ST.count == max)
4547 goto curlym_do_A; /* try to match a further A */
4549 /* backtrack one A */
4550 if (ST.count == ARG1(ST.me) /* min */)
4553 locinput = HOPc(locinput, -ST.alen);
4554 goto curlym_do_B; /* try to match B */
4557 #define ST st->u.curly
4559 #define CURLY_SETPAREN(paren, success) \
4562 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4563 PL_regoffs[paren].end = locinput - PL_bostr; \
4564 *PL_reglastcloseparen = paren; \
4567 PL_regoffs[paren].end = -1; \
4570 case STAR: /* /A*B/ where A is width 1 */
4574 scan = NEXTOPER(scan);
4576 case PLUS: /* /A+B/ where A is width 1 */
4580 scan = NEXTOPER(scan);
4582 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4583 ST.paren = scan->flags; /* Which paren to set */
4584 if (ST.paren > PL_regsize)
4585 PL_regsize = ST.paren;
4586 if (ST.paren > *PL_reglastparen)
4587 *PL_reglastparen = ST.paren;
4588 ST.min = ARG1(scan); /* min to match */
4589 ST.max = ARG2(scan); /* max to match */
4590 if (cur_eval && cur_eval->u.eval.close_paren &&
4591 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4595 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4597 case CURLY: /* /A{m,n}B/ where A is width 1 */
4599 ST.min = ARG1(scan); /* min to match */
4600 ST.max = ARG2(scan); /* max to match */
4601 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4604 * Lookahead to avoid useless match attempts
4605 * when we know what character comes next.
4607 * Used to only do .*x and .*?x, but now it allows
4608 * for )'s, ('s and (?{ ... })'s to be in the way
4609 * of the quantifier and the EXACT-like node. -- japhy
4612 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4614 if (HAS_TEXT(next) || JUMPABLE(next)) {
4616 regnode *text_node = next;
4618 if (! HAS_TEXT(text_node))
4619 FIND_NEXT_IMPT(text_node);
4621 if (! HAS_TEXT(text_node))
4622 ST.c1 = ST.c2 = CHRTEST_VOID;
4624 if ( PL_regkind[OP(text_node)] != EXACT ) {
4625 ST.c1 = ST.c2 = CHRTEST_VOID;
4626 goto assume_ok_easy;
4629 s = (U8*)STRING(text_node);
4631 /* Currently we only get here when
4633 PL_rekind[OP(text_node)] == EXACT
4635 if this changes back then the macro for IS_TEXT and
4636 friends need to change. */
4639 if (IS_TEXTF(text_node))
4640 ST.c2 = PL_fold[ST.c1];
4641 else if (IS_TEXTFL(text_node))
4642 ST.c2 = PL_fold_locale[ST.c1];
4645 if (IS_TEXTF(text_node)) {
4646 STRLEN ulen1, ulen2;
4647 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4648 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4650 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4651 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4653 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4655 0 : UTF8_ALLOW_ANY);
4656 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4658 0 : UTF8_ALLOW_ANY);
4660 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4662 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4667 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4674 ST.c1 = ST.c2 = CHRTEST_VOID;
4679 PL_reginput = locinput;
4682 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4685 locinput = PL_reginput;
4687 if (ST.c1 == CHRTEST_VOID)
4688 goto curly_try_B_min;
4690 ST.oldloc = locinput;
4692 /* set ST.maxpos to the furthest point along the
4693 * string that could possibly match */
4694 if (ST.max == REG_INFTY) {
4695 ST.maxpos = PL_regeol - 1;
4697 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4701 int m = ST.max - ST.min;
4702 for (ST.maxpos = locinput;
4703 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4704 ST.maxpos += UTF8SKIP(ST.maxpos);
4707 ST.maxpos = locinput + ST.max - ST.min;
4708 if (ST.maxpos >= PL_regeol)
4709 ST.maxpos = PL_regeol - 1;
4711 goto curly_try_B_min_known;
4715 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4716 locinput = PL_reginput;
4717 if (ST.count < ST.min)
4719 if ((ST.count > ST.min)
4720 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4722 /* A{m,n} must come at the end of the string, there's
4723 * no point in backing off ... */
4725 /* ...except that $ and \Z can match before *and* after
4726 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4727 We may back off by one in this case. */
4728 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4732 goto curly_try_B_max;
4737 case CURLY_B_min_known_fail:
4738 /* failed to find B in a non-greedy match where c1,c2 valid */
4739 if (ST.paren && ST.count)
4740 PL_regoffs[ST.paren].end = -1;
4742 PL_reginput = locinput; /* Could be reset... */
4743 REGCP_UNWIND(ST.cp);
4744 /* Couldn't or didn't -- move forward. */
4745 ST.oldloc = locinput;
4747 locinput += UTF8SKIP(locinput);
4751 curly_try_B_min_known:
4752 /* find the next place where 'B' could work, then call B */
4756 n = (ST.oldloc == locinput) ? 0 : 1;
4757 if (ST.c1 == ST.c2) {
4759 /* set n to utf8_distance(oldloc, locinput) */
4760 while (locinput <= ST.maxpos &&
4761 utf8n_to_uvchr((U8*)locinput,
4762 UTF8_MAXBYTES, &len,
4763 uniflags) != (UV)ST.c1) {
4769 /* set n to utf8_distance(oldloc, locinput) */
4770 while (locinput <= ST.maxpos) {
4772 const UV c = utf8n_to_uvchr((U8*)locinput,
4773 UTF8_MAXBYTES, &len,
4775 if (c == (UV)ST.c1 || c == (UV)ST.c2)
4783 if (ST.c1 == ST.c2) {
4784 while (locinput <= ST.maxpos &&
4785 UCHARAT(locinput) != ST.c1)
4789 while (locinput <= ST.maxpos
4790 && UCHARAT(locinput) != ST.c1
4791 && UCHARAT(locinput) != ST.c2)
4794 n = locinput - ST.oldloc;
4796 if (locinput > ST.maxpos)
4798 /* PL_reginput == oldloc now */
4801 if (regrepeat(rex, ST.A, n, depth) < n)
4804 PL_reginput = locinput;
4805 CURLY_SETPAREN(ST.paren, ST.count);
4806 if (cur_eval && cur_eval->u.eval.close_paren &&
4807 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4810 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
4815 case CURLY_B_min_fail:
4816 /* failed to find B in a non-greedy match where c1,c2 invalid */
4817 if (ST.paren && ST.count)
4818 PL_regoffs[ST.paren].end = -1;
4820 REGCP_UNWIND(ST.cp);
4821 /* failed -- move forward one */
4822 PL_reginput = locinput;
4823 if (regrepeat(rex, ST.A, 1, depth)) {
4825 locinput = PL_reginput;
4826 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
4827 ST.count > 0)) /* count overflow ? */
4830 CURLY_SETPAREN(ST.paren, ST.count);
4831 if (cur_eval && cur_eval->u.eval.close_paren &&
4832 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4835 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
4843 /* a successful greedy match: now try to match B */
4844 if (cur_eval && cur_eval->u.eval.close_paren &&
4845 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4850 if (ST.c1 != CHRTEST_VOID)
4851 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
4852 UTF8_MAXBYTES, 0, uniflags)
4853 : (UV) UCHARAT(PL_reginput);
4854 /* If it could work, try it. */
4855 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
4856 CURLY_SETPAREN(ST.paren, ST.count);
4857 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
4862 case CURLY_B_max_fail:
4863 /* failed to find B in a greedy match */
4864 if (ST.paren && ST.count)
4865 PL_regoffs[ST.paren].end = -1;
4867 REGCP_UNWIND(ST.cp);
4869 if (--ST.count < ST.min)
4871 PL_reginput = locinput = HOPc(locinput, -1);
4872 goto curly_try_B_max;
4879 /* we've just finished A in /(??{A})B/; now continue with B */
4881 st->u.eval.toggle_reg_flags
4882 = cur_eval->u.eval.toggle_reg_flags;
4883 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
4885 st->u.eval.prev_rex = rex_sv; /* inner */
4886 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
4887 rex = (struct regexp *)SvANY(rex_sv);
4888 rexi = RXi_GET(rex);
4889 cur_curlyx = cur_eval->u.eval.prev_curlyx;
4890 ReREFCNT_inc(rex_sv);
4891 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
4893 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4894 PL_reglastparen = &rex->lastparen;
4895 PL_reglastcloseparen = &rex->lastcloseparen;
4897 REGCP_SET(st->u.eval.lastcp);
4898 PL_reginput = locinput;
4900 /* Restore parens of the outer rex without popping the
4902 tmpix = PL_savestack_ix;
4903 PL_savestack_ix = cur_eval->u.eval.lastcp;
4905 PL_savestack_ix = tmpix;
4907 st->u.eval.prev_eval = cur_eval;
4908 cur_eval = cur_eval->u.eval.prev_eval;
4910 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
4911 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
4912 if ( nochange_depth )
4915 PUSH_YES_STATE_GOTO(EVAL_AB,
4916 st->u.eval.prev_eval->u.eval.B); /* match B */
4919 if (locinput < reginfo->till) {
4920 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4921 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
4923 (long)(locinput - PL_reg_starttry),
4924 (long)(reginfo->till - PL_reg_starttry),
4927 sayNO_SILENT; /* Cannot match: too short. */
4929 PL_reginput = locinput; /* put where regtry can find it */
4930 sayYES; /* Success! */
4932 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
4934 PerlIO_printf(Perl_debug_log,
4935 "%*s %ssubpattern success...%s\n",
4936 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
4937 PL_reginput = locinput; /* put where regtry can find it */
4938 sayYES; /* Success! */
4941 #define ST st->u.ifmatch
4943 case SUSPEND: /* (?>A) */
4945 PL_reginput = locinput;
4948 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
4950 goto ifmatch_trivial_fail_test;
4952 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
4954 ifmatch_trivial_fail_test:
4956 char * const s = HOPBACKc(locinput, scan->flags);
4961 sw = 1 - (bool)ST.wanted;
4965 next = scan + ARG(scan);
4973 PL_reginput = locinput;
4977 ST.logical = logical;
4978 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
4980 /* execute body of (?...A) */
4981 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
4984 case IFMATCH_A_fail: /* body of (?...A) failed */
4985 ST.wanted = !ST.wanted;
4988 case IFMATCH_A: /* body of (?...A) succeeded */
4990 sw = (bool)ST.wanted;
4992 else if (!ST.wanted)
4995 if (OP(ST.me) == SUSPEND)
4996 locinput = PL_reginput;
4998 locinput = PL_reginput = st->locinput;
4999 nextchr = UCHARAT(locinput);
5001 scan = ST.me + ARG(ST.me);
5004 continue; /* execute B */
5009 next = scan + ARG(scan);
5014 reginfo->cutpoint = PL_regeol;
5017 PL_reginput = locinput;
5019 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5020 PUSH_STATE_GOTO(COMMIT_next,next);
5022 case COMMIT_next_fail:
5029 #define ST st->u.mark
5031 ST.prev_mark = mark_state;
5032 ST.mark_name = sv_commit = sv_yes_mark
5033 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5035 ST.mark_loc = PL_reginput = locinput;
5036 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5038 case MARKPOINT_next:
5039 mark_state = ST.prev_mark;
5042 case MARKPOINT_next_fail:
5043 if (popmark && sv_eq(ST.mark_name,popmark))
5045 if (ST.mark_loc > startpoint)
5046 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5047 popmark = NULL; /* we found our mark */
5048 sv_commit = ST.mark_name;
5051 PerlIO_printf(Perl_debug_log,
5052 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5053 REPORT_CODE_OFF+depth*2, "",
5054 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5057 mark_state = ST.prev_mark;
5058 sv_yes_mark = mark_state ?
5059 mark_state->u.mark.mark_name : NULL;
5063 PL_reginput = locinput;
5065 /* (*SKIP) : if we fail we cut here*/
5066 ST.mark_name = NULL;
5067 ST.mark_loc = locinput;
5068 PUSH_STATE_GOTO(SKIP_next,next);
5070 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5071 otherwise do nothing. Meaning we need to scan
5073 regmatch_state *cur = mark_state;
5074 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5077 if ( sv_eq( cur->u.mark.mark_name,
5080 ST.mark_name = find;
5081 PUSH_STATE_GOTO( SKIP_next, next );
5083 cur = cur->u.mark.prev_mark;
5086 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5088 case SKIP_next_fail:
5090 /* (*CUT:NAME) - Set up to search for the name as we
5091 collapse the stack*/
5092 popmark = ST.mark_name;
5094 /* (*CUT) - No name, we cut here.*/
5095 if (ST.mark_loc > startpoint)
5096 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5097 /* but we set sv_commit to latest mark_name if there
5098 is one so they can test to see how things lead to this
5101 sv_commit=mark_state->u.mark.mark_name;
5109 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
5111 } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5114 U8 folded[UTF8_MAXBYTES_CASE+1];
5116 const char * const l = locinput;
5117 char *e = PL_regeol;
5118 to_uni_fold(n, folded, &foldlen);
5120 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
5121 l, &e, 0, do_utf8)) {
5126 nextchr = UCHARAT(locinput);
5129 if ((n=is_LNBREAK(locinput,do_utf8))) {
5131 nextchr = UCHARAT(locinput);
5136 #define CASE_CLASS(nAmE) \
5138 if ((n=is_##nAmE(locinput,do_utf8))) { \
5140 nextchr = UCHARAT(locinput); \
5145 if ((n=is_##nAmE(locinput,do_utf8))) { \
5148 locinput += UTF8SKIP(locinput); \
5149 nextchr = UCHARAT(locinput); \
5154 CASE_CLASS(HORIZWS);
5158 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5159 PTR2UV(scan), OP(scan));
5160 Perl_croak(aTHX_ "regexp memory corruption");
5164 /* switch break jumps here */
5165 scan = next; /* prepare to execute the next op and ... */
5166 continue; /* ... jump back to the top, reusing st */
5170 /* push a state that backtracks on success */
5171 st->u.yes.prev_yes_state = yes_state;
5175 /* push a new regex state, then continue at scan */
5177 regmatch_state *newst;
5180 regmatch_state *cur = st;
5181 regmatch_state *curyes = yes_state;
5183 regmatch_slab *slab = PL_regmatch_slab;
5184 for (;curd > -1;cur--,curd--) {
5185 if (cur < SLAB_FIRST(slab)) {
5187 cur = SLAB_LAST(slab);
5189 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5190 REPORT_CODE_OFF + 2 + depth * 2,"",
5191 curd, PL_reg_name[cur->resume_state],
5192 (curyes == cur) ? "yes" : ""
5195 curyes = cur->u.yes.prev_yes_state;
5198 DEBUG_STATE_pp("push")
5201 st->locinput = locinput;
5203 if (newst > SLAB_LAST(PL_regmatch_slab))
5204 newst = S_push_slab(aTHX);
5205 PL_regmatch_state = newst;
5207 locinput = PL_reginput;
5208 nextchr = UCHARAT(locinput);
5216 * We get here only if there's trouble -- normally "case END" is
5217 * the terminating point.
5219 Perl_croak(aTHX_ "corrupted regexp pointers");
5225 /* we have successfully completed a subexpression, but we must now
5226 * pop to the state marked by yes_state and continue from there */
5227 assert(st != yes_state);
5229 while (st != yes_state) {
5231 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5232 PL_regmatch_slab = PL_regmatch_slab->prev;
5233 st = SLAB_LAST(PL_regmatch_slab);
5237 DEBUG_STATE_pp("pop (no final)");
5239 DEBUG_STATE_pp("pop (yes)");
5245 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5246 || yes_state > SLAB_LAST(PL_regmatch_slab))
5248 /* not in this slab, pop slab */
5249 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5250 PL_regmatch_slab = PL_regmatch_slab->prev;
5251 st = SLAB_LAST(PL_regmatch_slab);
5253 depth -= (st - yes_state);
5256 yes_state = st->u.yes.prev_yes_state;
5257 PL_regmatch_state = st;
5260 locinput= st->locinput;
5261 nextchr = UCHARAT(locinput);
5263 state_num = st->resume_state + no_final;
5264 goto reenter_switch;
5267 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5268 PL_colors[4], PL_colors[5]));
5270 if (PL_reg_eval_set) {
5271 /* each successfully executed (?{...}) block does the equivalent of
5272 * local $^R = do {...}
5273 * When popping the save stack, all these locals would be undone;
5274 * bypass this by setting the outermost saved $^R to the latest
5276 if (oreplsv != GvSV(PL_replgv))
5277 sv_setsv(oreplsv, GvSV(PL_replgv));
5284 PerlIO_printf(Perl_debug_log,
5285 "%*s %sfailed...%s\n",
5286 REPORT_CODE_OFF+depth*2, "",
5287 PL_colors[4], PL_colors[5])
5299 /* there's a previous state to backtrack to */
5301 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5302 PL_regmatch_slab = PL_regmatch_slab->prev;
5303 st = SLAB_LAST(PL_regmatch_slab);
5305 PL_regmatch_state = st;
5306 locinput= st->locinput;
5307 nextchr = UCHARAT(locinput);
5309 DEBUG_STATE_pp("pop");
5311 if (yes_state == st)
5312 yes_state = st->u.yes.prev_yes_state;
5314 state_num = st->resume_state + 1; /* failure = success + 1 */
5315 goto reenter_switch;
5320 if (rex->intflags & PREGf_VERBARG_SEEN) {
5321 SV *sv_err = get_sv("REGERROR", 1);
5322 SV *sv_mrk = get_sv("REGMARK", 1);
5324 sv_commit = &PL_sv_no;
5326 sv_yes_mark = &PL_sv_yes;
5329 sv_commit = &PL_sv_yes;
5330 sv_yes_mark = &PL_sv_no;
5332 sv_setsv(sv_err, sv_commit);
5333 sv_setsv(sv_mrk, sv_yes_mark);
5336 /* clean up; in particular, free all slabs above current one */
5337 LEAVE_SCOPE(oldsave);
5343 - regrepeat - repeatedly match something simple, report how many
5346 * [This routine now assumes that it will only match on things of length 1.
5347 * That was true before, but now we assume scan - reginput is the count,
5348 * rather than incrementing count on every character. [Er, except utf8.]]
5351 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5354 register char *scan;
5356 register char *loceol = PL_regeol;
5357 register I32 hardcount = 0;
5358 register bool do_utf8 = PL_reg_match_utf8;
5360 PERL_UNUSED_ARG(depth);
5363 PERL_ARGS_ASSERT_REGREPEAT;
5366 if (max == REG_INFTY)
5368 else if (max < loceol - scan)
5369 loceol = scan + max;
5374 while (scan < loceol && hardcount < max && *scan != '\n') {
5375 scan += UTF8SKIP(scan);
5379 while (scan < loceol && *scan != '\n')
5386 while (scan < loceol && hardcount < max) {
5387 scan += UTF8SKIP(scan);
5397 case EXACT: /* length of string is 1 */
5399 while (scan < loceol && UCHARAT(scan) == c)
5402 case EXACTF: /* length of string is 1 */
5404 while (scan < loceol &&
5405 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5408 case EXACTFL: /* length of string is 1 */
5409 PL_reg_flags |= RF_tainted;
5411 while (scan < loceol &&
5412 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5418 while (hardcount < max && scan < loceol &&
5419 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5420 scan += UTF8SKIP(scan);
5424 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5431 LOAD_UTF8_CHARCLASS_ALNUM();
5432 while (hardcount < max && scan < loceol &&
5433 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5434 scan += UTF8SKIP(scan);
5438 while (scan < loceol && isALNUM(*scan))
5443 PL_reg_flags |= RF_tainted;
5446 while (hardcount < max && scan < loceol &&
5447 isALNUM_LC_utf8((U8*)scan)) {
5448 scan += UTF8SKIP(scan);
5452 while (scan < loceol && isALNUM_LC(*scan))
5459 LOAD_UTF8_CHARCLASS_ALNUM();
5460 while (hardcount < max && scan < loceol &&
5461 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5462 scan += UTF8SKIP(scan);
5466 while (scan < loceol && !isALNUM(*scan))
5471 PL_reg_flags |= RF_tainted;
5474 while (hardcount < max && scan < loceol &&
5475 !isALNUM_LC_utf8((U8*)scan)) {
5476 scan += UTF8SKIP(scan);
5480 while (scan < loceol && !isALNUM_LC(*scan))
5487 LOAD_UTF8_CHARCLASS_SPACE();
5488 while (hardcount < max && scan < loceol &&
5490 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5491 scan += UTF8SKIP(scan);
5495 while (scan < loceol && isSPACE(*scan))
5500 PL_reg_flags |= RF_tainted;
5503 while (hardcount < max && scan < loceol &&
5504 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5505 scan += UTF8SKIP(scan);
5509 while (scan < loceol && isSPACE_LC(*scan))
5516 LOAD_UTF8_CHARCLASS_SPACE();
5517 while (hardcount < max && scan < loceol &&
5519 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5520 scan += UTF8SKIP(scan);
5524 while (scan < loceol && !isSPACE(*scan))
5529 PL_reg_flags |= RF_tainted;
5532 while (hardcount < max && scan < loceol &&
5533 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5534 scan += UTF8SKIP(scan);
5538 while (scan < loceol && !isSPACE_LC(*scan))
5545 LOAD_UTF8_CHARCLASS_DIGIT();
5546 while (hardcount < max && scan < loceol &&
5547 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5548 scan += UTF8SKIP(scan);
5552 while (scan < loceol && isDIGIT(*scan))
5559 LOAD_UTF8_CHARCLASS_DIGIT();
5560 while (hardcount < max && scan < loceol &&
5561 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5562 scan += UTF8SKIP(scan);
5566 while (scan < loceol && !isDIGIT(*scan))
5572 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5578 LNBREAK can match two latin chars, which is ok,
5579 because we have a null terminated string, but we
5580 have to use hardcount in this situation
5582 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5591 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5596 while (scan < loceol && is_HORIZWS_latin1(scan))
5603 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5604 scan += UTF8SKIP(scan);
5608 while (scan < loceol && !is_HORIZWS_latin1(scan))
5616 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5621 while (scan < loceol && is_VERTWS_latin1(scan))
5629 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5630 scan += UTF8SKIP(scan);
5634 while (scan < loceol && !is_VERTWS_latin1(scan))
5640 default: /* Called on something of 0 width. */
5641 break; /* So match right here or not at all. */
5647 c = scan - PL_reginput;
5651 GET_RE_DEBUG_FLAGS_DECL;
5653 SV * const prop = sv_newmortal();
5654 regprop(prog, prop, p);
5655 PerlIO_printf(Perl_debug_log,
5656 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5657 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5665 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5667 - regclass_swash - prepare the utf8 swash
5671 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5677 RXi_GET_DECL(prog,progi);
5678 const struct reg_data * const data = prog ? progi->data : NULL;
5680 PERL_ARGS_ASSERT_REGCLASS_SWASH;
5682 if (data && data->count) {
5683 const U32 n = ARG(node);
5685 if (data->what[n] == 's') {
5686 SV * const rv = MUTABLE_SV(data->data[n]);
5687 AV * const av = MUTABLE_AV(SvRV(rv));
5688 SV **const ary = AvARRAY(av);
5691 /* See the end of regcomp.c:S_regclass() for
5692 * documentation of these array elements. */
5695 a = SvROK(ary[1]) ? &ary[1] : NULL;
5696 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5700 else if (si && doinit) {
5701 sw = swash_init("utf8", "", si, 1, 0);
5702 (void)av_store(av, 1, sw);
5719 - reginclass - determine if a character falls into a character class
5721 The n is the ANYOF regnode, the p is the target string, lenp
5722 is pointer to the maximum length of how far to go in the p
5723 (if the lenp is zero, UTF8SKIP(p) is used),
5724 do_utf8 tells whether the target string is in UTF-8.
5729 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5732 const char flags = ANYOF_FLAGS(n);
5738 PERL_ARGS_ASSERT_REGINCLASS;
5740 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5741 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5742 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_CHECK_ONLY);
5743 /* see [perl #37836] for UTF8_ALLOW_ANYUV */
5744 if (len == (STRLEN)-1)
5745 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5748 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
5749 if (do_utf8 || (flags & ANYOF_UNICODE)) {
5752 if (do_utf8 && !ANYOF_RUNTIME(n)) {
5753 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
5756 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
5760 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
5768 utf8_p = bytes_to_utf8(p, &len);
5770 if (swash_fetch(sw, utf8_p, 1))
5772 else if (flags & ANYOF_FOLD) {
5773 if (!match && lenp && av) {
5775 for (i = 0; i <= av_len(av); i++) {
5776 SV* const sv = *av_fetch(av, i, FALSE);
5778 const char * const s = SvPV_const(sv, len);
5779 if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
5787 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
5790 to_utf8_fold(utf8_p, tmpbuf, &tmplen);
5791 if (swash_fetch(sw, tmpbuf, 1))
5796 /* If we allocated a string above, free it */
5797 if (! do_utf8) Safefree(utf8_p);
5800 if (match && lenp && *lenp == 0)
5801 *lenp = UNISKIP(NATIVE_TO_UNI(c));
5803 if (!match && c < 256) {
5804 if (ANYOF_BITMAP_TEST(n, c))
5806 else if (flags & ANYOF_FOLD) {
5809 if (flags & ANYOF_LOCALE) {
5810 PL_reg_flags |= RF_tainted;
5811 f = PL_fold_locale[c];
5815 if (f != c && ANYOF_BITMAP_TEST(n, f))
5819 if (!match && (flags & ANYOF_CLASS)) {
5820 PL_reg_flags |= RF_tainted;
5822 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
5823 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
5824 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
5825 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
5826 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
5827 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
5828 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
5829 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
5830 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
5831 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
5832 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
5833 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
5834 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
5835 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
5836 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
5837 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
5838 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
5839 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
5840 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
5841 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
5842 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
5843 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
5844 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
5845 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
5846 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
5847 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
5848 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
5849 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
5850 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
5851 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
5852 ) /* How's that for a conditional? */
5859 return (flags & ANYOF_INVERT) ? !match : match;
5863 S_reghop3(U8 *s, I32 off, const U8* lim)
5867 PERL_ARGS_ASSERT_REGHOP3;
5870 while (off-- && s < lim) {
5871 /* XXX could check well-formedness here */
5876 while (off++ && s > lim) {
5878 if (UTF8_IS_CONTINUED(*s)) {
5879 while (s > lim && UTF8_IS_CONTINUATION(*s))
5882 /* XXX could check well-formedness here */
5889 /* there are a bunch of places where we use two reghop3's that should
5890 be replaced with this routine. but since thats not done yet
5891 we ifdef it out - dmq
5894 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
5898 PERL_ARGS_ASSERT_REGHOP4;
5901 while (off-- && s < rlim) {
5902 /* XXX could check well-formedness here */
5907 while (off++ && s > llim) {
5909 if (UTF8_IS_CONTINUED(*s)) {
5910 while (s > llim && UTF8_IS_CONTINUATION(*s))
5913 /* XXX could check well-formedness here */
5921 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
5925 PERL_ARGS_ASSERT_REGHOPMAYBE3;
5928 while (off-- && s < lim) {
5929 /* XXX could check well-formedness here */
5936 while (off++ && s > lim) {
5938 if (UTF8_IS_CONTINUED(*s)) {
5939 while (s > lim && UTF8_IS_CONTINUATION(*s))
5942 /* XXX could check well-formedness here */
5951 restore_pos(pTHX_ void *arg)
5954 regexp * const rex = (regexp *)arg;
5955 if (PL_reg_eval_set) {
5956 if (PL_reg_oldsaved) {
5957 rex->subbeg = PL_reg_oldsaved;
5958 rex->sublen = PL_reg_oldsavedlen;
5959 #ifdef PERL_OLD_COPY_ON_WRITE
5960 rex->saved_copy = PL_nrs;
5962 RXp_MATCH_COPIED_on(rex);
5964 PL_reg_magic->mg_len = PL_reg_oldpos;
5965 PL_reg_eval_set = 0;
5966 PL_curpm = PL_reg_oldcurpm;
5971 S_to_utf8_substr(pTHX_ register regexp *prog)
5975 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
5978 if (prog->substrs->data[i].substr
5979 && !prog->substrs->data[i].utf8_substr) {
5980 SV* const sv = newSVsv(prog->substrs->data[i].substr);
5981 prog->substrs->data[i].utf8_substr = sv;
5982 sv_utf8_upgrade(sv);
5983 if (SvVALID(prog->substrs->data[i].substr)) {
5984 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
5985 if (flags & FBMcf_TAIL) {
5986 /* Trim the trailing \n that fbm_compile added last
5988 SvCUR_set(sv, SvCUR(sv) - 1);
5989 /* Whilst this makes the SV technically "invalid" (as its
5990 buffer is no longer followed by "\0") when fbm_compile()
5991 adds the "\n" back, a "\0" is restored. */
5993 fbm_compile(sv, flags);
5995 if (prog->substrs->data[i].substr == prog->check_substr)
5996 prog->check_utf8 = sv;
6002 S_to_byte_substr(pTHX_ register regexp *prog)
6007 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6010 if (prog->substrs->data[i].utf8_substr
6011 && !prog->substrs->data[i].substr) {
6012 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6013 if (sv_utf8_downgrade(sv, TRUE)) {
6014 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6016 = BmFLAGS(prog->substrs->data[i].utf8_substr);
6017 if (flags & FBMcf_TAIL) {
6018 /* Trim the trailing \n that fbm_compile added last
6020 SvCUR_set(sv, SvCUR(sv) - 1);
6022 fbm_compile(sv, flags);
6028 prog->substrs->data[i].substr = sv;
6029 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6030 prog->check_substr = sv;
6037 * c-indentation-style: bsd
6039 * indent-tabs-mode: t
6042 * ex: set ts=8 sts=4 sw=4 noet: