5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
75 #undef PERL_IN_XSUB_RE
76 #define PERL_IN_XSUB_RE 1
79 #undef PERL_IN_XSUB_RE
81 #ifdef PERL_IN_XSUB_RE
87 #define RF_tainted 1 /* tainted information used? */
88 #define RF_warned 2 /* warned about big count? */
90 #define RF_utf8 8 /* Pattern contains multibyte chars? */
92 #define UTF ((PL_reg_flags & RF_utf8) != 0)
94 #define RS_init 1 /* eval environment created */
95 #define RS_set 2 /* replsv value is set */
101 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
107 #define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv))
108 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
110 #define HOPc(pos,off) \
111 (char *)(PL_reg_match_utf8 \
112 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
114 #define HOPBACKc(pos, off) \
115 (char*)(PL_reg_match_utf8\
116 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
117 : (pos - off >= PL_bostr) \
121 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
122 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
124 /* these are unrolled below in the CCC_TRY_XXX defined */
125 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
126 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
128 /* Doesn't do an assert to verify that is correct */
129 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
130 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
132 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
133 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
134 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
136 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
137 LOAD_UTF8_CHARCLASS(X_begin, " "); \
138 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
139 /* These are utf8 constants, and not utf-ebcdic constants, so the \
140 * assert should likely and hopefully fail on an EBCDIC machine */ \
141 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
143 /* No asserts are done for these, in case called on an early \
144 * Unicode version in which they map to nothing */ \
145 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
146 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
147 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
148 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
149 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
150 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
151 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
154 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
155 so that it is possible to override the option here without having to
156 rebuild the entire core. as we are required to do if we change regcomp.h
157 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
159 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
160 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
163 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
164 #define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM()
165 #define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE()
166 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
167 #define RE_utf8_perl_word PL_utf8_alnum
168 #define RE_utf8_perl_space PL_utf8_space
169 #define RE_utf8_posix_digit PL_utf8_digit
170 #define perl_word alnum
171 #define perl_space space
172 #define posix_digit digit
174 #define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a")
175 #define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ")
176 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
177 #define RE_utf8_perl_word PL_utf8_perl_word
178 #define RE_utf8_perl_space PL_utf8_perl_space
179 #define RE_utf8_posix_digit PL_utf8_posix_digit
183 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
185 PL_reg_flags |= RF_tainted; \
190 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \
191 if (!CAT2(PL_utf8_,CLASS)) { \
195 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
199 if (!(OP(scan) == NAME \
200 ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8) \
201 : LCFUNC_utf8((U8*)locinput))) \
205 locinput += PL_utf8skip[nextchr]; \
206 nextchr = UCHARAT(locinput); \
209 if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
211 nextchr = UCHARAT(++locinput); \
214 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
216 PL_reg_flags |= RF_tainted; \
219 if (!nextchr && locinput >= PL_regeol) \
221 if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \
222 if (!CAT2(PL_utf8_,CLASS)) { \
226 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
230 if ((OP(scan) == NAME \
231 ? (bool)swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8) \
232 : LCFUNC_utf8((U8*)locinput))) \
236 locinput += PL_utf8skip[nextchr]; \
237 nextchr = UCHARAT(locinput); \
240 if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
242 nextchr = UCHARAT(++locinput); \
249 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
251 /* for use after a quantifier and before an EXACT-like node -- japhy */
252 /* it would be nice to rework regcomp.sym to generate this stuff. sigh */
253 #define JUMPABLE(rn) ( \
255 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
257 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
258 OP(rn) == PLUS || OP(rn) == MINMOD || \
259 OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \
260 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
262 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
264 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
267 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
268 we don't need this definition. */
269 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
270 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
271 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
274 /* ... so we use this as its faster. */
275 #define IS_TEXT(rn) ( OP(rn)==EXACT )
276 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
277 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
282 Search for mandatory following text node; for lookahead, the text must
283 follow but for lookbehind (rn->flags != 0) we skip to the next step.
285 #define FIND_NEXT_IMPT(rn) STMT_START { \
286 while (JUMPABLE(rn)) { \
287 const OPCODE type = OP(rn); \
288 if (type == SUSPEND || PL_regkind[type] == CURLY) \
289 rn = NEXTOPER(NEXTOPER(rn)); \
290 else if (type == PLUS) \
292 else if (type == IFMATCH) \
293 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
294 else rn += NEXT_OFF(rn); \
299 static void restore_pos(pTHX_ void *arg);
302 S_regcppush(pTHX_ I32 parenfloor)
305 const int retval = PL_savestack_ix;
306 #define REGCP_PAREN_ELEMS 4
307 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
309 GET_RE_DEBUG_FLAGS_DECL;
311 if (paren_elems_to_push < 0)
312 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
314 #define REGCP_OTHER_ELEMS 7
315 SSGROW(paren_elems_to_push + REGCP_OTHER_ELEMS);
317 for (p = PL_regsize; p > parenfloor; p--) {
318 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
319 SSPUSHINT(PL_regoffs[p].end);
320 SSPUSHINT(PL_regoffs[p].start);
321 SSPUSHPTR(PL_reg_start_tmp[p]);
323 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
324 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
325 (UV)p, (IV)PL_regoffs[p].start,
326 (IV)(PL_reg_start_tmp[p] - PL_bostr),
327 (IV)PL_regoffs[p].end
330 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
331 SSPUSHPTR(PL_regoffs);
332 SSPUSHINT(PL_regsize);
333 SSPUSHINT(*PL_reglastparen);
334 SSPUSHINT(*PL_reglastcloseparen);
335 SSPUSHPTR(PL_reginput);
336 #define REGCP_FRAME_ELEMS 2
337 /* REGCP_FRAME_ELEMS are part of the REGCP_OTHER_ELEMS and
338 * are needed for the regexp context stack bookkeeping. */
339 SSPUSHINT(paren_elems_to_push + REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
340 SSPUSHINT(SAVEt_REGCONTEXT); /* Magic cookie. */
345 /* These are needed since we do not localize EVAL nodes: */
346 #define REGCP_SET(cp) \
348 PerlIO_printf(Perl_debug_log, \
349 " Setting an EVAL scope, savestack=%"IVdf"\n", \
350 (IV)PL_savestack_ix)); \
353 #define REGCP_UNWIND(cp) \
355 if (cp != PL_savestack_ix) \
356 PerlIO_printf(Perl_debug_log, \
357 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
358 (IV)(cp), (IV)PL_savestack_ix)); \
362 S_regcppop(pTHX_ const regexp *rex)
367 GET_RE_DEBUG_FLAGS_DECL;
369 PERL_ARGS_ASSERT_REGCPPOP;
371 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
373 assert(i == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
374 i = SSPOPINT; /* Parentheses elements to pop. */
375 input = (char *) SSPOPPTR;
376 *PL_reglastcloseparen = SSPOPINT;
377 *PL_reglastparen = SSPOPINT;
378 PL_regsize = SSPOPINT;
379 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
382 /* Now restore the parentheses context. */
383 for (i -= (REGCP_OTHER_ELEMS - REGCP_FRAME_ELEMS);
384 i > 0; i -= REGCP_PAREN_ELEMS) {
386 U32 paren = (U32)SSPOPINT;
387 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
388 PL_regoffs[paren].start = SSPOPINT;
390 if (paren <= *PL_reglastparen)
391 PL_regoffs[paren].end = tmps;
393 PerlIO_printf(Perl_debug_log,
394 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
395 (UV)paren, (IV)PL_regoffs[paren].start,
396 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
397 (IV)PL_regoffs[paren].end,
398 (paren > *PL_reglastparen ? "(no)" : ""));
402 if (*PL_reglastparen + 1 <= rex->nparens) {
403 PerlIO_printf(Perl_debug_log,
404 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
405 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
409 /* It would seem that the similar code in regtry()
410 * already takes care of this, and in fact it is in
411 * a better location to since this code can #if 0-ed out
412 * but the code in regtry() is needed or otherwise tests
413 * requiring null fields (pat.t#187 and split.t#{13,14}
414 * (as of patchlevel 7877) will fail. Then again,
415 * this code seems to be necessary or otherwise
416 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
417 * --jhi updated by dapm */
418 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
420 PL_regoffs[i].start = -1;
421 PL_regoffs[i].end = -1;
427 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
430 * pregexec and friends
433 #ifndef PERL_IN_XSUB_RE
435 - pregexec - match a regexp against a string
438 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
439 char *strbeg, I32 minend, SV *screamer, U32 nosave)
440 /* strend: pointer to null at end of string */
441 /* strbeg: real beginning of string */
442 /* minend: end of match must be >=minend after stringarg. */
443 /* nosave: For optimizations. */
445 PERL_ARGS_ASSERT_PREGEXEC;
448 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
449 nosave ? 0 : REXEC_COPY_STR);
454 * Need to implement the following flags for reg_anch:
456 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
458 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
459 * INTUIT_AUTORITATIVE_ML
460 * INTUIT_ONCE_NOML - Intuit can match in one location only.
463 * Another flag for this function: SECOND_TIME (so that float substrs
464 * with giant delta may be not rechecked).
467 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
469 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
470 Otherwise, only SvCUR(sv) is used to get strbeg. */
472 /* XXXX We assume that strpos is strbeg unless sv. */
474 /* XXXX Some places assume that there is a fixed substring.
475 An update may be needed if optimizer marks as "INTUITable"
476 RExen without fixed substrings. Similarly, it is assumed that
477 lengths of all the strings are no more than minlen, thus they
478 cannot come from lookahead.
479 (Or minlen should take into account lookahead.)
480 NOTE: Some of this comment is not correct. minlen does now take account
481 of lookahead/behind. Further research is required. -- demerphq
485 /* A failure to find a constant substring means that there is no need to make
486 an expensive call to REx engine, thus we celebrate a failure. Similarly,
487 finding a substring too deep into the string means that less calls to
488 regtry() should be needed.
490 REx compiler's optimizer found 4 possible hints:
491 a) Anchored substring;
493 c) Whether we are anchored (beginning-of-line or \G);
494 d) First node (of those at offset 0) which may distingush positions;
495 We use a)b)d) and multiline-part of c), and try to find a position in the
496 string which does not contradict any of them.
499 /* Most of decisions we do here should have been done at compile time.
500 The nodes of the REx which we used for the search should have been
501 deleted from the finite automaton. */
504 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
505 char *strend, const U32 flags, re_scream_pos_data *data)
508 struct regexp *const prog = (struct regexp *)SvANY(rx);
509 register I32 start_shift = 0;
510 /* Should be nonnegative! */
511 register I32 end_shift = 0;
516 const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
518 register char *other_last = NULL; /* other substr checked before this */
519 char *check_at = NULL; /* check substr found at this pos */
520 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
521 RXi_GET_DECL(prog,progi);
523 const char * const i_strpos = strpos;
525 GET_RE_DEBUG_FLAGS_DECL;
527 PERL_ARGS_ASSERT_RE_INTUIT_START;
529 RX_MATCH_UTF8_set(rx,do_utf8);
532 PL_reg_flags |= RF_utf8;
535 debug_start_match(rx, do_utf8, strpos, strend,
536 sv ? "Guessing start of match in sv for"
537 : "Guessing start of match in string for");
540 /* CHR_DIST() would be more correct here but it makes things slow. */
541 if (prog->minlen > strend - strpos) {
542 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
543 "String too short... [re_intuit_start]\n"));
547 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
550 if (!prog->check_utf8 && prog->check_substr)
551 to_utf8_substr(prog);
552 check = prog->check_utf8;
554 if (!prog->check_substr && prog->check_utf8)
555 to_byte_substr(prog);
556 check = prog->check_substr;
558 if (check == &PL_sv_undef) {
559 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
560 "Non-utf8 string cannot match utf8 check string\n"));
563 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
564 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
565 || ( (prog->extflags & RXf_ANCH_BOL)
566 && !multiline ) ); /* Check after \n? */
569 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
570 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
571 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
573 && (strpos != strbeg)) {
574 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
577 if (prog->check_offset_min == prog->check_offset_max &&
578 !(prog->extflags & RXf_CANY_SEEN)) {
579 /* Substring at constant offset from beg-of-str... */
582 s = HOP3c(strpos, prog->check_offset_min, strend);
585 slen = SvCUR(check); /* >= 1 */
587 if ( strend - s > slen || strend - s < slen - 1
588 || (strend - s == slen && strend[-1] != '\n')) {
589 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
592 /* Now should match s[0..slen-2] */
594 if (slen && (*SvPVX_const(check) != *s
596 && memNE(SvPVX_const(check), s, slen)))) {
598 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
602 else if (*SvPVX_const(check) != *s
603 || ((slen = SvCUR(check)) > 1
604 && memNE(SvPVX_const(check), s, slen)))
607 goto success_at_start;
610 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
612 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
613 end_shift = prog->check_end_shift;
616 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
617 - (SvTAIL(check) != 0);
618 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
620 if (end_shift < eshift)
624 else { /* Can match at random position */
627 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
628 end_shift = prog->check_end_shift;
630 /* end shift should be non negative here */
633 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
635 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
636 (IV)end_shift, RX_PRECOMP(prog));
640 /* Find a possible match in the region s..strend by looking for
641 the "check" substring in the region corrected by start/end_shift. */
644 I32 srch_start_shift = start_shift;
645 I32 srch_end_shift = end_shift;
646 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
647 srch_end_shift -= ((strbeg - s) - srch_start_shift);
648 srch_start_shift = strbeg - s;
650 DEBUG_OPTIMISE_MORE_r({
651 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
652 (IV)prog->check_offset_min,
653 (IV)srch_start_shift,
655 (IV)prog->check_end_shift);
658 if (flags & REXEC_SCREAM) {
659 I32 p = -1; /* Internal iterator of scream. */
660 I32 * const pp = data ? data->scream_pos : &p;
662 if (PL_screamfirst[BmRARE(check)] >= 0
663 || ( BmRARE(check) == '\n'
664 && (BmPREVIOUS(check) == SvCUR(check) - 1)
666 s = screaminstr(sv, check,
667 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
670 /* we may be pointing at the wrong string */
671 if (s && RXp_MATCH_COPIED(prog))
672 s = strbeg + (s - SvPVX_const(sv));
674 *data->scream_olds = s;
679 if (prog->extflags & RXf_CANY_SEEN) {
680 start_point= (U8*)(s + srch_start_shift);
681 end_point= (U8*)(strend - srch_end_shift);
683 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
684 end_point= HOP3(strend, -srch_end_shift, strbeg);
686 DEBUG_OPTIMISE_MORE_r({
687 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
688 (int)(end_point - start_point),
689 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
693 s = fbm_instr( start_point, end_point,
694 check, multiline ? FBMrf_MULTILINE : 0);
697 /* Update the count-of-usability, remove useless subpatterns,
701 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
702 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
703 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
704 (s ? "Found" : "Did not find"),
705 (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)
706 ? "anchored" : "floating"),
709 (s ? " at offset " : "...\n") );
714 /* Finish the diagnostic message */
715 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
717 /* XXX dmq: first branch is for positive lookbehind...
718 Our check string is offset from the beginning of the pattern.
719 So we need to do any stclass tests offset forward from that
728 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
729 Start with the other substr.
730 XXXX no SCREAM optimization yet - and a very coarse implementation
731 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
732 *always* match. Probably should be marked during compile...
733 Probably it is right to do no SCREAM here...
736 if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8)
737 : (prog->float_substr && prog->anchored_substr))
739 /* Take into account the "other" substring. */
740 /* XXXX May be hopelessly wrong for UTF... */
743 if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) {
746 char * const last = HOP3c(s, -start_shift, strbeg);
748 char * const saved_s = s;
751 t = s - prog->check_offset_max;
752 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
754 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
759 t = HOP3c(t, prog->anchored_offset, strend);
760 if (t < other_last) /* These positions already checked */
762 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
765 /* XXXX It is not documented what units *_offsets are in.
766 We assume bytes, but this is clearly wrong.
767 Meaning this code needs to be carefully reviewed for errors.
771 /* On end-of-str: see comment below. */
772 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
773 if (must == &PL_sv_undef) {
775 DEBUG_r(must = prog->anchored_utf8); /* for debug */
780 HOP3(HOP3(last1, prog->anchored_offset, strend)
781 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
783 multiline ? FBMrf_MULTILINE : 0
786 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
787 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
788 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
789 (s ? "Found" : "Contradicts"),
790 quoted, RE_SV_TAIL(must));
795 if (last1 >= last2) {
796 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
797 ", giving up...\n"));
800 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
801 ", trying floating at offset %ld...\n",
802 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
803 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
804 s = HOP3c(last, 1, strend);
808 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
809 (long)(s - i_strpos)));
810 t = HOP3c(s, -prog->anchored_offset, strbeg);
811 other_last = HOP3c(s, 1, strend);
819 else { /* Take into account the floating substring. */
821 char * const saved_s = s;
824 t = HOP3c(s, -start_shift, strbeg);
826 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
827 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
828 last = HOP3c(t, prog->float_max_offset, strend);
829 s = HOP3c(t, prog->float_min_offset, strend);
832 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
833 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
834 /* fbm_instr() takes into account exact value of end-of-str
835 if the check is SvTAIL(ed). Since false positives are OK,
836 and end-of-str is not later than strend we are OK. */
837 if (must == &PL_sv_undef) {
839 DEBUG_r(must = prog->float_utf8); /* for debug message */
842 s = fbm_instr((unsigned char*)s,
843 (unsigned char*)last + SvCUR(must)
845 must, multiline ? FBMrf_MULTILINE : 0);
847 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
848 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
849 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
850 (s ? "Found" : "Contradicts"),
851 quoted, RE_SV_TAIL(must));
855 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
856 ", giving up...\n"));
859 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
860 ", trying anchored starting at offset %ld...\n",
861 (long)(saved_s + 1 - i_strpos)));
863 s = HOP3c(t, 1, strend);
867 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
868 (long)(s - i_strpos)));
869 other_last = s; /* Fix this later. --Hugo */
879 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
881 DEBUG_OPTIMISE_MORE_r(
882 PerlIO_printf(Perl_debug_log,
883 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
884 (IV)prog->check_offset_min,
885 (IV)prog->check_offset_max,
893 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
895 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
898 /* Fixed substring is found far enough so that the match
899 cannot start at strpos. */
901 if (ml_anch && t[-1] != '\n') {
902 /* Eventually fbm_*() should handle this, but often
903 anchored_offset is not 0, so this check will not be wasted. */
904 /* XXXX In the code below we prefer to look for "^" even in
905 presence of anchored substrings. And we search even
906 beyond the found float position. These pessimizations
907 are historical artefacts only. */
909 while (t < strend - prog->minlen) {
911 if (t < check_at - prog->check_offset_min) {
912 if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) {
913 /* Since we moved from the found position,
914 we definitely contradict the found anchored
915 substr. Due to the above check we do not
916 contradict "check" substr.
917 Thus we can arrive here only if check substr
918 is float. Redo checking for "other"=="fixed".
921 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
922 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
923 goto do_other_anchored;
925 /* We don't contradict the found floating substring. */
926 /* XXXX Why not check for STCLASS? */
928 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
929 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
932 /* Position contradicts check-string */
933 /* XXXX probably better to look for check-string
934 than for "\n", so one should lower the limit for t? */
935 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
936 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
937 other_last = strpos = s = t + 1;
942 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
943 PL_colors[0], PL_colors[1]));
947 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
948 PL_colors[0], PL_colors[1]));
952 ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
955 /* The found string does not prohibit matching at strpos,
956 - no optimization of calling REx engine can be performed,
957 unless it was an MBOL and we are not after MBOL,
958 or a future STCLASS check will fail this. */
960 /* Even in this situation we may use MBOL flag if strpos is offset
961 wrt the start of the string. */
962 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
963 && (strpos != strbeg) && strpos[-1] != '\n'
964 /* May be due to an implicit anchor of m{.*foo} */
965 && !(prog->intflags & PREGf_IMPLICIT))
970 DEBUG_EXECUTE_r( if (ml_anch)
971 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
972 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
975 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
977 prog->check_utf8 /* Could be deleted already */
978 && --BmUSEFUL(prog->check_utf8) < 0
979 && (prog->check_utf8 == prog->float_utf8)
981 prog->check_substr /* Could be deleted already */
982 && --BmUSEFUL(prog->check_substr) < 0
983 && (prog->check_substr == prog->float_substr)
986 /* If flags & SOMETHING - do not do it many times on the same match */
987 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
988 /* XXX Does the destruction order has to change with do_utf8? */
989 SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr);
990 SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8);
991 prog->check_substr = prog->check_utf8 = NULL; /* disable */
992 prog->float_substr = prog->float_utf8 = NULL; /* clear */
993 check = NULL; /* abort */
995 /* XXXX This is a remnant of the old implementation. It
996 looks wasteful, since now INTUIT can use many
998 prog->extflags &= ~RXf_USE_INTUIT;
1004 /* Last resort... */
1005 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1006 /* trie stclasses are too expensive to use here, we are better off to
1007 leave it to regmatch itself */
1008 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1009 /* minlen == 0 is possible if regstclass is \b or \B,
1010 and the fixed substr is ''$.
1011 Since minlen is already taken into account, s+1 is before strend;
1012 accidentally, minlen >= 1 guaranties no false positives at s + 1
1013 even for \b or \B. But (minlen? 1 : 0) below assumes that
1014 regstclass does not come from lookahead... */
1015 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1016 This leaves EXACTF only, which is dealt with in find_byclass(). */
1017 const U8* const str = (U8*)STRING(progi->regstclass);
1018 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1019 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1022 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1023 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1024 else if (prog->float_substr || prog->float_utf8)
1025 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1029 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1030 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1033 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1036 const char *what = NULL;
1038 if (endpos == strend) {
1039 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1040 "Could not match STCLASS...\n") );
1043 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1044 "This position contradicts STCLASS...\n") );
1045 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1047 /* Contradict one of substrings */
1048 if (prog->anchored_substr || prog->anchored_utf8) {
1049 if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1050 DEBUG_EXECUTE_r( what = "anchored" );
1052 s = HOP3c(t, 1, strend);
1053 if (s + start_shift + end_shift > strend) {
1054 /* XXXX Should be taken into account earlier? */
1055 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1056 "Could not match STCLASS...\n") );
1061 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1062 "Looking for %s substr starting at offset %ld...\n",
1063 what, (long)(s + start_shift - i_strpos)) );
1066 /* Have both, check_string is floating */
1067 if (t + start_shift >= check_at) /* Contradicts floating=check */
1068 goto retry_floating_check;
1069 /* Recheck anchored substring, but not floating... */
1073 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1074 "Looking for anchored substr starting at offset %ld...\n",
1075 (long)(other_last - i_strpos)) );
1076 goto do_other_anchored;
1078 /* Another way we could have checked stclass at the
1079 current position only: */
1084 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1085 "Looking for /%s^%s/m starting at offset %ld...\n",
1086 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1089 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1091 /* Check is floating subtring. */
1092 retry_floating_check:
1093 t = check_at - start_shift;
1094 DEBUG_EXECUTE_r( what = "floating" );
1095 goto hop_and_restart;
1098 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1099 "By STCLASS: moving %ld --> %ld\n",
1100 (long)(t - i_strpos), (long)(s - i_strpos))
1104 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1105 "Does not contradict STCLASS...\n");
1110 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1111 PL_colors[4], (check ? "Guessed" : "Giving up"),
1112 PL_colors[5], (long)(s - i_strpos)) );
1115 fail_finish: /* Substring not found */
1116 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1117 BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1119 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1120 PL_colors[4], PL_colors[5]));
1124 #define DECL_TRIE_TYPE(scan) \
1125 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1126 trie_type = (scan->flags != EXACT) \
1127 ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \
1128 : (do_utf8 ? trie_utf8 : trie_plain)
1130 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1131 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1132 switch (trie_type) { \
1133 case trie_utf8_fold: \
1134 if ( foldlen>0 ) { \
1135 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1140 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1141 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1142 foldlen -= UNISKIP( uvc ); \
1143 uscan = foldbuf + UNISKIP( uvc ); \
1146 case trie_latin_utf8_fold: \
1147 if ( foldlen>0 ) { \
1148 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1154 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1155 foldlen -= UNISKIP( uvc ); \
1156 uscan = foldbuf + UNISKIP( uvc ); \
1160 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1167 charid = trie->charmap[ uvc ]; \
1171 if (widecharmap) { \
1172 SV** const svpp = hv_fetch(widecharmap, \
1173 (char*)&uvc, sizeof(UV), 0); \
1175 charid = (U16)SvIV(*svpp); \
1180 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1182 char *my_strend= (char *)strend; \
1185 !ibcmp_utf8(s, &my_strend, 0, do_utf8, \
1186 m, NULL, ln, (bool)UTF)) \
1187 && (!reginfo || regtry(reginfo, &s)) ) \
1190 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1191 uvchr_to_utf8(tmpbuf, c); \
1192 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1194 && (f == c1 || f == c2) \
1196 !ibcmp_utf8(s, &my_strend, 0, do_utf8,\
1197 m, NULL, ln, (bool)UTF)) \
1198 && (!reginfo || regtry(reginfo, &s)) ) \
1204 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1208 && (ln == 1 || !(OP(c) == EXACTF \
1210 : ibcmp_locale(s, m, ln))) \
1211 && (!reginfo || regtry(reginfo, &s)) ) \
1217 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1219 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1225 #define REXEC_FBC_SCAN(CoDe) \
1227 while (s < strend) { \
1233 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1234 REXEC_FBC_UTF8_SCAN( \
1236 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1245 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1248 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1257 #define REXEC_FBC_TRYIT \
1258 if ((!reginfo || regtry(reginfo, &s))) \
1261 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1263 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1266 REXEC_FBC_CLASS_SCAN(CoNd); \
1270 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1273 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1276 REXEC_FBC_CLASS_SCAN(CoNd); \
1280 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1281 PL_reg_flags |= RF_tainted; \
1283 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1286 REXEC_FBC_CLASS_SCAN(CoNd); \
1290 #define DUMP_EXEC_POS(li,s,doutf8) \
1291 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1293 /* We know what class REx starts with. Try to find this position... */
1294 /* if reginfo is NULL, its a dryrun */
1295 /* annoyingly all the vars in this routine have different names from their counterparts
1296 in regmatch. /grrr */
1299 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1300 const char *strend, regmatch_info *reginfo)
1303 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1307 register STRLEN uskip;
1311 register I32 tmp = 1; /* Scratch variable? */
1312 register const bool do_utf8 = PL_reg_match_utf8;
1313 RXi_GET_DECL(prog,progi);
1315 PERL_ARGS_ASSERT_FIND_BYCLASS;
1317 /* We know what class it must start with. */
1321 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1322 !UTF8_IS_INVARIANT((U8)s[0]) ?
1323 reginclass(prog, c, (U8*)s, 0, do_utf8) :
1324 REGINCLASS(prog, c, (U8*)s));
1327 while (s < strend) {
1330 if (REGINCLASS(prog, c, (U8*)s) ||
1331 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1332 /* The assignment of 2 is intentional:
1333 * for the folded sharp s, the skip is 2. */
1334 (skip = SHARP_S_SKIP))) {
1335 if (tmp && (!reginfo || regtry(reginfo, &s)))
1348 if (tmp && (!reginfo || regtry(reginfo, &s)))
1356 ln = STR_LEN(c); /* length to match in octets/bytes */
1357 lnc = (I32) ln; /* length to match in characters */
1359 STRLEN ulen1, ulen2;
1361 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1362 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1363 /* used by commented-out code below */
1364 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1366 /* XXX: Since the node will be case folded at compile
1367 time this logic is a little odd, although im not
1368 sure that its actually wrong. --dmq */
1370 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1371 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1373 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1374 codepoint of the first character in the converted
1375 form, yet originally we did the extra step.
1376 No tests fail by commenting this code out however
1377 so Ive left it out. -- dmq.
1379 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1381 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1386 while (sm < ((U8 *) m + ln)) {
1401 c2 = PL_fold_locale[c1];
1403 e = HOP3c(strend, -((I32)lnc), s);
1405 if (!reginfo && e < s)
1406 e = s; /* Due to minlen logic of intuit() */
1408 /* The idea in the EXACTF* cases is to first find the
1409 * first character of the EXACTF* node and then, if
1410 * necessary, case-insensitively compare the full
1411 * text of the node. The c1 and c2 are the first
1412 * characters (though in Unicode it gets a bit
1413 * more complicated because there are more cases
1414 * than just upper and lower: one needs to use
1415 * the so-called folding case for case-insensitive
1416 * matching (called "loose matching" in Unicode).
1417 * ibcmp_utf8() will do just that. */
1419 if (do_utf8 || UTF) {
1421 U8 tmpbuf [UTF8_MAXBYTES+1];
1424 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1426 /* Upper and lower of 1st char are equal -
1427 * probably not a "letter". */
1430 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1435 REXEC_FBC_EXACTISH_CHECK(c == c1);
1441 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1447 /* Handle some of the three Greek sigmas cases.
1448 * Note that not all the possible combinations
1449 * are handled here: some of them are handled
1450 * by the standard folding rules, and some of
1451 * them (the character class or ANYOF cases)
1452 * are handled during compiletime in
1453 * regexec.c:S_regclass(). */
1454 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1455 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1456 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1458 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1463 /* Neither pattern nor string are UTF8 */
1465 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1467 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1471 PL_reg_flags |= RF_tainted;
1478 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1479 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1481 tmp = ((OP(c) == BOUND ?
1482 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1483 LOAD_UTF8_CHARCLASS_ALNUM();
1484 REXEC_FBC_UTF8_SCAN(
1485 if (tmp == !(OP(c) == BOUND ?
1486 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1487 isALNUM_LC_utf8((U8*)s)))
1495 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1496 tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1499 !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) {
1505 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1509 PL_reg_flags |= RF_tainted;
1516 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1517 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1519 tmp = ((OP(c) == NBOUND ?
1520 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1521 LOAD_UTF8_CHARCLASS_ALNUM();
1522 REXEC_FBC_UTF8_SCAN(
1523 if (tmp == !(OP(c) == NBOUND ?
1524 (bool)swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8) :
1525 isALNUM_LC_utf8((U8*)s)))
1527 else REXEC_FBC_TRYIT;
1531 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1532 tmp = ((OP(c) == NBOUND ?
1533 isALNUM(tmp) : isALNUM_LC(tmp)) != 0);
1536 !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s)))
1538 else REXEC_FBC_TRYIT;
1541 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1545 REXEC_FBC_CSCAN_PRELOAD(
1546 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1547 swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1551 REXEC_FBC_CSCAN_TAINT(
1552 isALNUM_LC_utf8((U8*)s),
1556 REXEC_FBC_CSCAN_PRELOAD(
1557 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1558 !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8),
1562 REXEC_FBC_CSCAN_TAINT(
1563 !isALNUM_LC_utf8((U8*)s),
1567 REXEC_FBC_CSCAN_PRELOAD(
1568 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1569 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8),
1573 REXEC_FBC_CSCAN_TAINT(
1574 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1578 REXEC_FBC_CSCAN_PRELOAD(
1579 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1580 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)),
1584 REXEC_FBC_CSCAN_TAINT(
1585 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1589 REXEC_FBC_CSCAN_PRELOAD(
1590 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1591 swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1595 REXEC_FBC_CSCAN_TAINT(
1596 isDIGIT_LC_utf8((U8*)s),
1600 REXEC_FBC_CSCAN_PRELOAD(
1601 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1602 !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8),
1606 REXEC_FBC_CSCAN_TAINT(
1607 !isDIGIT_LC_utf8((U8*)s),
1613 is_LNBREAK_latin1(s)
1623 !is_VERTWS_latin1(s)
1628 is_HORIZWS_latin1(s)
1632 !is_HORIZWS_utf8(s),
1633 !is_HORIZWS_latin1(s)
1639 /* what trie are we using right now */
1641 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1643 = (reg_trie_data*)progi->data->data[ aho->trie ];
1644 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1646 const char *last_start = strend - trie->minlen;
1648 const char *real_start = s;
1650 STRLEN maxlen = trie->maxlen;
1652 U8 **points; /* map of where we were in the input string
1653 when reading a given char. For ASCII this
1654 is unnecessary overhead as the relationship
1655 is always 1:1, but for Unicode, especially
1656 case folded Unicode this is not true. */
1657 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1661 GET_RE_DEBUG_FLAGS_DECL;
1663 /* We can't just allocate points here. We need to wrap it in
1664 * an SV so it gets freed properly if there is a croak while
1665 * running the match */
1668 sv_points=newSV(maxlen * sizeof(U8 *));
1669 SvCUR_set(sv_points,
1670 maxlen * sizeof(U8 *));
1671 SvPOK_on(sv_points);
1672 sv_2mortal(sv_points);
1673 points=(U8**)SvPV_nolen(sv_points );
1674 if ( trie_type != trie_utf8_fold
1675 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1678 bitmap=(U8*)trie->bitmap;
1680 bitmap=(U8*)ANYOF_BITMAP(c);
1682 /* this is the Aho-Corasick algorithm modified a touch
1683 to include special handling for long "unknown char"
1684 sequences. The basic idea being that we use AC as long
1685 as we are dealing with a possible matching char, when
1686 we encounter an unknown char (and we have not encountered
1687 an accepting state) we scan forward until we find a legal
1689 AC matching is basically that of trie matching, except
1690 that when we encounter a failing transition, we fall back
1691 to the current states "fail state", and try the current char
1692 again, a process we repeat until we reach the root state,
1693 state 1, or a legal transition. If we fail on the root state
1694 then we can either terminate if we have reached an accepting
1695 state previously, or restart the entire process from the beginning
1699 while (s <= last_start) {
1700 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1708 U8 *uscan = (U8*)NULL;
1709 U8 *leftmost = NULL;
1711 U32 accepted_word= 0;
1715 while ( state && uc <= (U8*)strend ) {
1717 U32 word = aho->states[ state ].wordnum;
1721 DEBUG_TRIE_EXECUTE_r(
1722 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1723 dump_exec_pos( (char *)uc, c, strend, real_start,
1724 (char *)uc, do_utf8 );
1725 PerlIO_printf( Perl_debug_log,
1726 " Scanning for legal start char...\n");
1729 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1734 if (uc >(U8*)last_start) break;
1738 U8 *lpos= points[ (pointpos - trie->wordlen[word-1] ) % maxlen ];
1739 if (!leftmost || lpos < leftmost) {
1740 DEBUG_r(accepted_word=word);
1746 points[pointpos++ % maxlen]= uc;
1747 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1748 uscan, len, uvc, charid, foldlen,
1750 DEBUG_TRIE_EXECUTE_r({
1751 dump_exec_pos( (char *)uc, c, strend, real_start,
1753 PerlIO_printf(Perl_debug_log,
1754 " Charid:%3u CP:%4"UVxf" ",
1760 word = aho->states[ state ].wordnum;
1762 base = aho->states[ state ].trans.base;
1764 DEBUG_TRIE_EXECUTE_r({
1766 dump_exec_pos( (char *)uc, c, strend, real_start,
1768 PerlIO_printf( Perl_debug_log,
1769 "%sState: %4"UVxf", word=%"UVxf,
1770 failed ? " Fail transition to " : "",
1771 (UV)state, (UV)word);
1776 (base + charid > trie->uniquecharcount )
1777 && (base + charid - 1 - trie->uniquecharcount
1779 && trie->trans[base + charid - 1 -
1780 trie->uniquecharcount].check == state
1781 && (tmp=trie->trans[base + charid - 1 -
1782 trie->uniquecharcount ].next))
1784 DEBUG_TRIE_EXECUTE_r(
1785 PerlIO_printf( Perl_debug_log," - legal\n"));
1790 DEBUG_TRIE_EXECUTE_r(
1791 PerlIO_printf( Perl_debug_log," - fail\n"));
1793 state = aho->fail[state];
1797 /* we must be accepting here */
1798 DEBUG_TRIE_EXECUTE_r(
1799 PerlIO_printf( Perl_debug_log," - accepting\n"));
1808 if (!state) state = 1;
1811 if ( aho->states[ state ].wordnum ) {
1812 U8 *lpos = points[ (pointpos - trie->wordlen[aho->states[ state ].wordnum-1]) % maxlen ];
1813 if (!leftmost || lpos < leftmost) {
1814 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1819 s = (char*)leftmost;
1820 DEBUG_TRIE_EXECUTE_r({
1822 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1823 (UV)accepted_word, (IV)(s - real_start)
1826 if (!reginfo || regtry(reginfo, &s)) {
1832 DEBUG_TRIE_EXECUTE_r({
1833 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1836 DEBUG_TRIE_EXECUTE_r(
1837 PerlIO_printf( Perl_debug_log,"No match.\n"));
1846 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1856 - regexec_flags - match a regexp against a string
1859 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1860 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1861 /* strend: pointer to null at end of string */
1862 /* strbeg: real beginning of string */
1863 /* minend: end of match must be >=minend after stringarg. */
1864 /* data: May be used for some additional optimizations.
1865 Currently its only used, with a U32 cast, for transmitting
1866 the ganch offset when doing a /g match. This will change */
1867 /* nosave: For optimizations. */
1870 struct regexp *const prog = (struct regexp *)SvANY(rx);
1871 /*register*/ char *s;
1872 register regnode *c;
1873 /*register*/ char *startpos = stringarg;
1874 I32 minlen; /* must match at least this many chars */
1875 I32 dontbother = 0; /* how many characters not to try at end */
1876 I32 end_shift = 0; /* Same for the end. */ /* CC */
1877 I32 scream_pos = -1; /* Internal iterator of scream. */
1878 char *scream_olds = NULL;
1879 const bool do_utf8 = (bool)DO_UTF8(sv);
1881 RXi_GET_DECL(prog,progi);
1882 regmatch_info reginfo; /* create some info to pass to regtry etc */
1883 regexp_paren_pair *swap = NULL;
1884 GET_RE_DEBUG_FLAGS_DECL;
1886 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1887 PERL_UNUSED_ARG(data);
1889 /* Be paranoid... */
1890 if (prog == NULL || startpos == NULL) {
1891 Perl_croak(aTHX_ "NULL regexp parameter");
1895 multiline = prog->extflags & RXf_PMf_MULTILINE;
1896 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
1898 RX_MATCH_UTF8_set(rx, do_utf8);
1900 debug_start_match(rx, do_utf8, startpos, strend,
1904 minlen = prog->minlen;
1906 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1907 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1908 "String too short [regexec_flags]...\n"));
1913 /* Check validity of program. */
1914 if (UCHARAT(progi->program) != REG_MAGIC) {
1915 Perl_croak(aTHX_ "corrupted regexp program");
1919 PL_reg_eval_set = 0;
1923 PL_reg_flags |= RF_utf8;
1925 /* Mark beginning of line for ^ and lookbehind. */
1926 reginfo.bol = startpos; /* XXX not used ??? */
1930 /* Mark end of line for $ (and such) */
1933 /* see how far we have to get to not match where we matched before */
1934 reginfo.till = startpos+minend;
1936 /* If there is a "must appear" string, look for it. */
1939 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1941 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
1942 reginfo.ganch = startpos + prog->gofs;
1943 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1944 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1945 } else if (sv && SvTYPE(sv) >= SVt_PVMG
1947 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1948 && mg->mg_len >= 0) {
1949 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
1950 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1951 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
1953 if (prog->extflags & RXf_ANCH_GPOS) {
1954 if (s > reginfo.ganch)
1956 s = reginfo.ganch - prog->gofs;
1957 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1958 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
1964 reginfo.ganch = strbeg + PTR2UV(data);
1965 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1966 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
1968 } else { /* pos() not defined */
1969 reginfo.ganch = strbeg;
1970 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1971 "GPOS: reginfo.ganch = strbeg\n"));
1974 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
1975 /* We have to be careful. If the previous successful match
1976 was from this regex we don't want a subsequent partially
1977 successful match to clobber the old results.
1978 So when we detect this possibility we add a swap buffer
1979 to the re, and switch the buffer each match. If we fail
1980 we switch it back, otherwise we leave it swapped.
1983 /* do we need a save destructor here for eval dies? */
1984 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
1986 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
1987 re_scream_pos_data d;
1989 d.scream_olds = &scream_olds;
1990 d.scream_pos = &scream_pos;
1991 s = re_intuit_start(rx, sv, s, strend, flags, &d);
1993 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
1994 goto phooey; /* not present */
2000 /* Simplest case: anchored match need be tried only once. */
2001 /* [unless only anchor is BOL and multiline is set] */
2002 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2003 if (s == startpos && regtry(®info, &startpos))
2005 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2006 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2011 dontbother = minlen - 1;
2012 end = HOP3c(strend, -dontbother, strbeg) - 1;
2013 /* for multiline we only have to try after newlines */
2014 if (prog->check_substr || prog->check_utf8) {
2015 /* because of the goto we can not easily reuse the macros for bifurcating the
2016 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2019 goto after_try_utf8;
2021 if (regtry(®info, &s)) {
2028 if (prog->extflags & RXf_USE_INTUIT) {
2029 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2038 } /* end search for check string in unicode */
2040 if (s == startpos) {
2041 goto after_try_latin;
2044 if (regtry(®info, &s)) {
2051 if (prog->extflags & RXf_USE_INTUIT) {
2052 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2061 } /* end search for check string in latin*/
2062 } /* end search for check string */
2063 else { /* search for newline */
2065 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2068 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2070 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2071 if (regtry(®info, &s))
2075 } /* end search for newline */
2076 } /* end anchored/multiline check string search */
2078 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2080 /* the warning about reginfo.ganch being used without intialization
2081 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2082 and we only enter this block when the same bit is set. */
2083 char *tmp_s = reginfo.ganch - prog->gofs;
2085 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2090 /* Messy cases: unanchored match. */
2091 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2092 /* we have /x+whatever/ */
2093 /* it must be a one character string (XXXX Except UTF?) */
2098 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2099 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2100 ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0];
2105 DEBUG_EXECUTE_r( did_match = 1 );
2106 if (regtry(®info, &s)) goto got_it;
2108 while (s < strend && *s == ch)
2116 DEBUG_EXECUTE_r( did_match = 1 );
2117 if (regtry(®info, &s)) goto got_it;
2119 while (s < strend && *s == ch)
2124 DEBUG_EXECUTE_r(if (!did_match)
2125 PerlIO_printf(Perl_debug_log,
2126 "Did not find anchored character...\n")
2129 else if (prog->anchored_substr != NULL
2130 || prog->anchored_utf8 != NULL
2131 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2132 && prog->float_max_offset < strend - s)) {
2137 char *last1; /* Last position checked before */
2141 if (prog->anchored_substr || prog->anchored_utf8) {
2142 if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr))
2143 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2144 must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr;
2145 back_max = back_min = prog->anchored_offset;
2147 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2148 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2149 must = do_utf8 ? prog->float_utf8 : prog->float_substr;
2150 back_max = prog->float_max_offset;
2151 back_min = prog->float_min_offset;
2155 if (must == &PL_sv_undef)
2156 /* could not downgrade utf8 check substring, so must fail */
2162 last = HOP3c(strend, /* Cannot start after this */
2163 -(I32)(CHR_SVLEN(must)
2164 - (SvTAIL(must) != 0) + back_min), strbeg);
2167 last1 = HOPc(s, -1);
2169 last1 = s - 1; /* bogus */
2171 /* XXXX check_substr already used to find "s", can optimize if
2172 check_substr==must. */
2174 dontbother = end_shift;
2175 strend = HOPc(strend, -dontbother);
2176 while ( (s <= last) &&
2177 ((flags & REXEC_SCREAM)
2178 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2179 end_shift, &scream_pos, 0))
2180 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2181 (unsigned char*)strend, must,
2182 multiline ? FBMrf_MULTILINE : 0))) ) {
2183 /* we may be pointing at the wrong string */
2184 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2185 s = strbeg + (s - SvPVX_const(sv));
2186 DEBUG_EXECUTE_r( did_match = 1 );
2187 if (HOPc(s, -back_max) > last1) {
2188 last1 = HOPc(s, -back_min);
2189 s = HOPc(s, -back_max);
2192 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2194 last1 = HOPc(s, -back_min);
2198 while (s <= last1) {
2199 if (regtry(®info, &s))
2205 while (s <= last1) {
2206 if (regtry(®info, &s))
2212 DEBUG_EXECUTE_r(if (!did_match) {
2213 RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0),
2214 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2215 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2216 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2217 ? "anchored" : "floating"),
2218 quoted, RE_SV_TAIL(must));
2222 else if ( (c = progi->regstclass) ) {
2224 const OPCODE op = OP(progi->regstclass);
2225 /* don't bother with what can't match */
2226 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2227 strend = HOPc(strend, -(minlen - 1));
2230 SV * const prop = sv_newmortal();
2231 regprop(prog, prop, c);
2233 RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1),
2235 PerlIO_printf(Perl_debug_log,
2236 "Matching stclass %.*s against %s (%d chars)\n",
2237 (int)SvCUR(prop), SvPVX_const(prop),
2238 quoted, (int)(strend - s));
2241 if (find_byclass(prog, c, s, strend, ®info))
2243 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2247 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2252 if (!(do_utf8 ? prog->float_utf8 : prog->float_substr))
2253 do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog);
2254 float_real = do_utf8 ? prog->float_utf8 : prog->float_substr;
2256 if (flags & REXEC_SCREAM) {
2257 last = screaminstr(sv, float_real, s - strbeg,
2258 end_shift, &scream_pos, 1); /* last one */
2260 last = scream_olds; /* Only one occurrence. */
2261 /* we may be pointing at the wrong string */
2262 else if (RXp_MATCH_COPIED(prog))
2263 s = strbeg + (s - SvPVX_const(sv));
2267 const char * const little = SvPV_const(float_real, len);
2269 if (SvTAIL(float_real)) {
2270 if (memEQ(strend - len + 1, little, len - 1))
2271 last = strend - len + 1;
2272 else if (!multiline)
2273 last = memEQ(strend - len, little, len)
2274 ? strend - len : NULL;
2280 last = rninstr(s, strend, little, little + len);
2282 last = strend; /* matching "$" */
2287 PerlIO_printf(Perl_debug_log,
2288 "%sCan't trim the tail, match fails (should not happen)%s\n",
2289 PL_colors[4], PL_colors[5]));
2290 goto phooey; /* Should not happen! */
2292 dontbother = strend - last + prog->float_min_offset;
2294 if (minlen && (dontbother < minlen))
2295 dontbother = minlen - 1;
2296 strend -= dontbother; /* this one's always in bytes! */
2297 /* We don't know much -- general case. */
2300 if (regtry(®info, &s))
2309 if (regtry(®info, &s))
2311 } while (s++ < strend);
2320 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2322 if (PL_reg_eval_set)
2323 restore_pos(aTHX_ prog);
2324 if (RXp_PAREN_NAMES(prog))
2325 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2327 /* make sure $`, $&, $', and $digit will work later */
2328 if ( !(flags & REXEC_NOT_FIRST) ) {
2329 RX_MATCH_COPY_FREE(rx);
2330 if (flags & REXEC_COPY_STR) {
2331 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2332 #ifdef PERL_OLD_COPY_ON_WRITE
2334 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2336 PerlIO_printf(Perl_debug_log,
2337 "Copy on write: regexp capture, type %d\n",
2340 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2341 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2342 assert (SvPOKp(prog->saved_copy));
2346 RX_MATCH_COPIED_on(rx);
2347 s = savepvn(strbeg, i);
2353 prog->subbeg = strbeg;
2354 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2361 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2362 PL_colors[4], PL_colors[5]));
2363 if (PL_reg_eval_set)
2364 restore_pos(aTHX_ prog);
2366 /* we failed :-( roll it back */
2367 Safefree(prog->offs);
2376 - regtry - try match at specific point
2378 STATIC I32 /* 0 failure, 1 success */
2379 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2383 REGEXP *const rx = reginfo->prog;
2384 regexp *const prog = (struct regexp *)SvANY(rx);
2385 RXi_GET_DECL(prog,progi);
2386 GET_RE_DEBUG_FLAGS_DECL;
2388 PERL_ARGS_ASSERT_REGTRY;
2390 reginfo->cutpoint=NULL;
2392 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2395 PL_reg_eval_set = RS_init;
2396 DEBUG_EXECUTE_r(DEBUG_s(
2397 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2398 (IV)(PL_stack_sp - PL_stack_base));
2401 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2402 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2404 /* Apparently this is not needed, judging by wantarray. */
2405 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2406 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2409 /* Make $_ available to executed code. */
2410 if (reginfo->sv != DEFSV) {
2412 DEFSV_set(reginfo->sv);
2415 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2416 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2417 /* prepare for quick setting of pos */
2418 #ifdef PERL_OLD_COPY_ON_WRITE
2419 if (SvIsCOW(reginfo->sv))
2420 sv_force_normal_flags(reginfo->sv, 0);
2422 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2423 &PL_vtbl_mglob, NULL, 0);
2427 PL_reg_oldpos = mg->mg_len;
2428 SAVEDESTRUCTOR_X(restore_pos, prog);
2430 if (!PL_reg_curpm) {
2431 Newxz(PL_reg_curpm, 1, PMOP);
2434 SV* const repointer = &PL_sv_undef;
2435 /* this regexp is also owned by the new PL_reg_curpm, which
2436 will try to free it. */
2437 av_push(PL_regex_padav, repointer);
2438 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2439 PL_regex_pad = AvARRAY(PL_regex_padav);
2444 /* It seems that non-ithreads works both with and without this code.
2445 So for efficiency reasons it seems best not to have the code
2446 compiled when it is not needed. */
2447 /* This is safe against NULLs: */
2448 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2449 /* PM_reg_curpm owns a reference to this regexp. */
2452 PM_SETRE(PL_reg_curpm, rx);
2453 PL_reg_oldcurpm = PL_curpm;
2454 PL_curpm = PL_reg_curpm;
2455 if (RXp_MATCH_COPIED(prog)) {
2456 /* Here is a serious problem: we cannot rewrite subbeg,
2457 since it may be needed if this match fails. Thus
2458 $` inside (?{}) could fail... */
2459 PL_reg_oldsaved = prog->subbeg;
2460 PL_reg_oldsavedlen = prog->sublen;
2461 #ifdef PERL_OLD_COPY_ON_WRITE
2462 PL_nrs = prog->saved_copy;
2464 RXp_MATCH_COPIED_off(prog);
2467 PL_reg_oldsaved = NULL;
2468 prog->subbeg = PL_bostr;
2469 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2471 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2472 prog->offs[0].start = *startpos - PL_bostr;
2473 PL_reginput = *startpos;
2474 PL_reglastparen = &prog->lastparen;
2475 PL_reglastcloseparen = &prog->lastcloseparen;
2476 prog->lastparen = 0;
2477 prog->lastcloseparen = 0;
2479 PL_regoffs = prog->offs;
2480 if (PL_reg_start_tmpl <= prog->nparens) {
2481 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2482 if(PL_reg_start_tmp)
2483 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2485 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2488 /* XXXX What this code is doing here?!!! There should be no need
2489 to do this again and again, PL_reglastparen should take care of
2492 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2493 * Actually, the code in regcppop() (which Ilya may be meaning by
2494 * PL_reglastparen), is not needed at all by the test suite
2495 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2496 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2497 * Meanwhile, this code *is* needed for the
2498 * above-mentioned test suite tests to succeed. The common theme
2499 * on those tests seems to be returning null fields from matches.
2500 * --jhi updated by dapm */
2502 if (prog->nparens) {
2503 regexp_paren_pair *pp = PL_regoffs;
2505 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2513 if (regmatch(reginfo, progi->program + 1)) {
2514 PL_regoffs[0].end = PL_reginput - PL_bostr;
2517 if (reginfo->cutpoint)
2518 *startpos= reginfo->cutpoint;
2519 REGCP_UNWIND(lastcp);
2524 #define sayYES goto yes
2525 #define sayNO goto no
2526 #define sayNO_SILENT goto no_silent
2528 /* we dont use STMT_START/END here because it leads to
2529 "unreachable code" warnings, which are bogus, but distracting. */
2530 #define CACHEsayNO \
2531 if (ST.cache_mask) \
2532 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2535 /* this is used to determine how far from the left messages like
2536 'failed...' are printed. It should be set such that messages
2537 are inline with the regop output that created them.
2539 #define REPORT_CODE_OFF 32
2542 /* Make sure there is a test for this +1 options in re_tests */
2543 #define TRIE_INITAL_ACCEPT_BUFFLEN 4;
2545 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2546 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2548 #define SLAB_FIRST(s) (&(s)->states[0])
2549 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2551 /* grab a new slab and return the first slot in it */
2553 STATIC regmatch_state *
2556 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2559 regmatch_slab *s = PL_regmatch_slab->next;
2561 Newx(s, 1, regmatch_slab);
2562 s->prev = PL_regmatch_slab;
2564 PL_regmatch_slab->next = s;
2566 PL_regmatch_slab = s;
2567 return SLAB_FIRST(s);
2571 /* push a new state then goto it */
2573 #define PUSH_STATE_GOTO(state, node) \
2575 st->resume_state = state; \
2578 /* push a new state with success backtracking, then goto it */
2580 #define PUSH_YES_STATE_GOTO(state, node) \
2582 st->resume_state = state; \
2583 goto push_yes_state;
2589 regmatch() - main matching routine
2591 This is basically one big switch statement in a loop. We execute an op,
2592 set 'next' to point the next op, and continue. If we come to a point which
2593 we may need to backtrack to on failure such as (A|B|C), we push a
2594 backtrack state onto the backtrack stack. On failure, we pop the top
2595 state, and re-enter the loop at the state indicated. If there are no more
2596 states to pop, we return failure.
2598 Sometimes we also need to backtrack on success; for example /A+/, where
2599 after successfully matching one A, we need to go back and try to
2600 match another one; similarly for lookahead assertions: if the assertion
2601 completes successfully, we backtrack to the state just before the assertion
2602 and then carry on. In these cases, the pushed state is marked as
2603 'backtrack on success too'. This marking is in fact done by a chain of
2604 pointers, each pointing to the previous 'yes' state. On success, we pop to
2605 the nearest yes state, discarding any intermediate failure-only states.
2606 Sometimes a yes state is pushed just to force some cleanup code to be
2607 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2608 it to free the inner regex.
2610 Note that failure backtracking rewinds the cursor position, while
2611 success backtracking leaves it alone.
2613 A pattern is complete when the END op is executed, while a subpattern
2614 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2615 ops trigger the "pop to last yes state if any, otherwise return true"
2618 A common convention in this function is to use A and B to refer to the two
2619 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2620 the subpattern to be matched possibly multiple times, while B is the entire
2621 rest of the pattern. Variable and state names reflect this convention.
2623 The states in the main switch are the union of ops and failure/success of
2624 substates associated with with that op. For example, IFMATCH is the op
2625 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2626 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2627 successfully matched A and IFMATCH_A_fail is a state saying that we have
2628 just failed to match A. Resume states always come in pairs. The backtrack
2629 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2630 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2631 on success or failure.
2633 The struct that holds a backtracking state is actually a big union, with
2634 one variant for each major type of op. The variable st points to the
2635 top-most backtrack struct. To make the code clearer, within each
2636 block of code we #define ST to alias the relevant union.
2638 Here's a concrete example of a (vastly oversimplified) IFMATCH
2644 #define ST st->u.ifmatch
2646 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2647 ST.foo = ...; // some state we wish to save
2649 // push a yes backtrack state with a resume value of
2650 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2652 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2655 case IFMATCH_A: // we have successfully executed A; now continue with B
2657 bar = ST.foo; // do something with the preserved value
2660 case IFMATCH_A_fail: // A failed, so the assertion failed
2661 ...; // do some housekeeping, then ...
2662 sayNO; // propagate the failure
2669 For any old-timers reading this who are familiar with the old recursive
2670 approach, the code above is equivalent to:
2672 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2681 ...; // do some housekeeping, then ...
2682 sayNO; // propagate the failure
2685 The topmost backtrack state, pointed to by st, is usually free. If you
2686 want to claim it, populate any ST.foo fields in it with values you wish to
2687 save, then do one of
2689 PUSH_STATE_GOTO(resume_state, node);
2690 PUSH_YES_STATE_GOTO(resume_state, node);
2692 which sets that backtrack state's resume value to 'resume_state', pushes a
2693 new free entry to the top of the backtrack stack, then goes to 'node'.
2694 On backtracking, the free slot is popped, and the saved state becomes the
2695 new free state. An ST.foo field in this new top state can be temporarily
2696 accessed to retrieve values, but once the main loop is re-entered, it
2697 becomes available for reuse.
2699 Note that the depth of the backtrack stack constantly increases during the
2700 left-to-right execution of the pattern, rather than going up and down with
2701 the pattern nesting. For example the stack is at its maximum at Z at the
2702 end of the pattern, rather than at X in the following:
2704 /(((X)+)+)+....(Y)+....Z/
2706 The only exceptions to this are lookahead/behind assertions and the cut,
2707 (?>A), which pop all the backtrack states associated with A before
2710 Bascktrack state structs are allocated in slabs of about 4K in size.
2711 PL_regmatch_state and st always point to the currently active state,
2712 and PL_regmatch_slab points to the slab currently containing
2713 PL_regmatch_state. The first time regmatch() is called, the first slab is
2714 allocated, and is never freed until interpreter destruction. When the slab
2715 is full, a new one is allocated and chained to the end. At exit from
2716 regmatch(), slabs allocated since entry are freed.
2721 #define DEBUG_STATE_pp(pp) \
2723 DUMP_EXEC_POS(locinput, scan, do_utf8); \
2724 PerlIO_printf(Perl_debug_log, \
2725 " %*s"pp" %s%s%s%s%s\n", \
2727 PL_reg_name[st->resume_state], \
2728 ((st==yes_state||st==mark_state) ? "[" : ""), \
2729 ((st==yes_state) ? "Y" : ""), \
2730 ((st==mark_state) ? "M" : ""), \
2731 ((st==yes_state||st==mark_state) ? "]" : "") \
2736 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2741 S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8,
2742 const char *start, const char *end, const char *blurb)
2744 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2746 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2751 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2752 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2754 RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1),
2755 start, end - start, 60);
2757 PerlIO_printf(Perl_debug_log,
2758 "%s%s REx%s %s against %s\n",
2759 PL_colors[4], blurb, PL_colors[5], s0, s1);
2761 if (do_utf8||utf8_pat)
2762 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2763 utf8_pat ? "pattern" : "",
2764 utf8_pat && do_utf8 ? " and " : "",
2765 do_utf8 ? "string" : ""
2771 S_dump_exec_pos(pTHX_ const char *locinput,
2772 const regnode *scan,
2773 const char *loc_regeol,
2774 const char *loc_bostr,
2775 const char *loc_reg_starttry,
2778 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2779 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2780 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2781 /* The part of the string before starttry has one color
2782 (pref0_len chars), between starttry and current
2783 position another one (pref_len - pref0_len chars),
2784 after the current position the third one.
2785 We assume that pref0_len <= pref_len, otherwise we
2786 decrease pref0_len. */
2787 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2788 ? (5 + taill) - l : locinput - loc_bostr;
2791 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2793 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2795 pref0_len = pref_len - (locinput - loc_reg_starttry);
2796 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2797 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2798 ? (5 + taill) - pref_len : loc_regeol - locinput);
2799 while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2803 if (pref0_len > pref_len)
2804 pref0_len = pref_len;
2806 const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0;
2808 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2809 (locinput - pref_len),pref0_len, 60, 4, 5);
2811 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2812 (locinput - pref_len + pref0_len),
2813 pref_len - pref0_len, 60, 2, 3);
2815 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2816 locinput, loc_regeol - locinput, 10, 0, 1);
2818 const STRLEN tlen=len0+len1+len2;
2819 PerlIO_printf(Perl_debug_log,
2820 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2821 (IV)(locinput - loc_bostr),
2824 (docolor ? "" : "> <"),
2826 (int)(tlen > 19 ? 0 : 19 - tlen),
2833 /* reg_check_named_buff_matched()
2834 * Checks to see if a named buffer has matched. The data array of
2835 * buffer numbers corresponding to the buffer is expected to reside
2836 * in the regexp->data->data array in the slot stored in the ARG() of
2837 * node involved. Note that this routine doesn't actually care about the
2838 * name, that information is not preserved from compilation to execution.
2839 * Returns the index of the leftmost defined buffer with the given name
2840 * or 0 if non of the buffers matched.
2843 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2846 RXi_GET_DECL(rex,rexi);
2847 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2848 I32 *nums=(I32*)SvPVX(sv_dat);
2850 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2852 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2853 if ((I32)*PL_reglastparen >= nums[n] &&
2854 PL_regoffs[nums[n]].end != -1)
2863 /* free all slabs above current one - called during LEAVE_SCOPE */
2866 S_clear_backtrack_stack(pTHX_ void *p)
2868 regmatch_slab *s = PL_regmatch_slab->next;
2873 PL_regmatch_slab->next = NULL;
2875 regmatch_slab * const osl = s;
2882 #define SETREX(Re1,Re2) \
2883 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2886 STATIC I32 /* 0 failure, 1 success */
2887 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2889 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2893 register const bool do_utf8 = PL_reg_match_utf8;
2894 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2895 REGEXP *rex_sv = reginfo->prog;
2896 regexp *rex = (struct regexp *)SvANY(rex_sv);
2897 RXi_GET_DECL(rex,rexi);
2899 /* the current state. This is a cached copy of PL_regmatch_state */
2900 register regmatch_state *st;
2901 /* cache heavy used fields of st in registers */
2902 register regnode *scan;
2903 register regnode *next;
2904 register U32 n = 0; /* general value; init to avoid compiler warning */
2905 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2906 register char *locinput = PL_reginput;
2907 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2909 bool result = 0; /* return value of S_regmatch */
2910 int depth = 0; /* depth of backtrack stack */
2911 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2912 const U32 max_nochange_depth =
2913 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2914 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2915 regmatch_state *yes_state = NULL; /* state to pop to on success of
2917 /* mark_state piggy backs on the yes_state logic so that when we unwind
2918 the stack on success we can update the mark_state as we go */
2919 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2920 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2921 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2923 bool no_final = 0; /* prevent failure from backtracking? */
2924 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2925 char *startpoint = PL_reginput;
2926 SV *popmark = NULL; /* are we looking for a mark? */
2927 SV *sv_commit = NULL; /* last mark name seen in failure */
2928 SV *sv_yes_mark = NULL; /* last mark name we have seen
2929 during a successfull match */
2930 U32 lastopen = 0; /* last open we saw */
2931 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2932 SV* const oreplsv = GvSV(PL_replgv);
2933 /* these three flags are set by various ops to signal information to
2934 * the very next op. They have a useful lifetime of exactly one loop
2935 * iteration, and are not preserved or restored by state pushes/pops
2937 bool sw = 0; /* the condition value in (?(cond)a|b) */
2938 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2939 int logical = 0; /* the following EVAL is:
2943 or the following IFMATCH/UNLESSM is:
2944 false: plain (?=foo)
2945 true: used as a condition: (?(?=foo))
2948 GET_RE_DEBUG_FLAGS_DECL;
2951 PERL_ARGS_ASSERT_REGMATCH;
2953 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
2954 PerlIO_printf(Perl_debug_log,"regmatch start\n");
2956 /* on first ever call to regmatch, allocate first slab */
2957 if (!PL_regmatch_slab) {
2958 Newx(PL_regmatch_slab, 1, regmatch_slab);
2959 PL_regmatch_slab->prev = NULL;
2960 PL_regmatch_slab->next = NULL;
2961 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2964 oldsave = PL_savestack_ix;
2965 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
2966 SAVEVPTR(PL_regmatch_slab);
2967 SAVEVPTR(PL_regmatch_state);
2969 /* grab next free state slot */
2970 st = ++PL_regmatch_state;
2971 if (st > SLAB_LAST(PL_regmatch_slab))
2972 st = PL_regmatch_state = S_push_slab(aTHX);
2974 /* Note that nextchr is a byte even in UTF */
2975 nextchr = UCHARAT(locinput);
2977 while (scan != NULL) {
2980 SV * const prop = sv_newmortal();
2981 regnode *rnext=regnext(scan);
2982 DUMP_EXEC_POS( locinput, scan, do_utf8 );
2983 regprop(rex, prop, scan);
2985 PerlIO_printf(Perl_debug_log,
2986 "%3"IVdf":%*s%s(%"IVdf")\n",
2987 (IV)(scan - rexi->program), depth*2, "",
2989 (PL_regkind[OP(scan)] == END || !rnext) ?
2990 0 : (IV)(rnext - rexi->program));
2993 next = scan + NEXT_OFF(scan);
2996 state_num = OP(scan);
2998 REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3001 assert(PL_reglastparen == &rex->lastparen);
3002 assert(PL_reglastcloseparen == &rex->lastcloseparen);
3003 assert(PL_regoffs == rex->offs);
3005 switch (state_num) {
3007 if (locinput == PL_bostr)
3009 /* reginfo->till = reginfo->bol; */
3014 if (locinput == PL_bostr ||
3015 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3021 if (locinput == PL_bostr)
3025 if (locinput == reginfo->ganch)
3030 /* update the startpoint */
3031 st->u.keeper.val = PL_regoffs[0].start;
3032 PL_reginput = locinput;
3033 PL_regoffs[0].start = locinput - PL_bostr;
3034 PUSH_STATE_GOTO(KEEPS_next, next);
3036 case KEEPS_next_fail:
3037 /* rollback the start point change */
3038 PL_regoffs[0].start = st->u.keeper.val;
3044 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3049 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3051 if (PL_regeol - locinput > 1)
3055 if (PL_regeol != locinput)
3059 if (!nextchr && locinput >= PL_regeol)
3062 locinput += PL_utf8skip[nextchr];
3063 if (locinput > PL_regeol)
3065 nextchr = UCHARAT(locinput);
3068 nextchr = UCHARAT(++locinput);
3071 if (!nextchr && locinput >= PL_regeol)
3073 nextchr = UCHARAT(++locinput);
3076 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3079 locinput += PL_utf8skip[nextchr];
3080 if (locinput > PL_regeol)
3082 nextchr = UCHARAT(locinput);
3085 nextchr = UCHARAT(++locinput);
3089 #define ST st->u.trie
3091 /* In this case the charclass data is available inline so
3092 we can fail fast without a lot of extra overhead.
3094 if (scan->flags == EXACT || !do_utf8) {
3095 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3097 PerlIO_printf(Perl_debug_log,
3098 "%*s %sfailed to match trie start class...%s\n",
3099 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3108 /* what type of TRIE am I? (utf8 makes this contextual) */
3109 DECL_TRIE_TYPE(scan);
3111 /* what trie are we using right now */
3112 reg_trie_data * const trie
3113 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3114 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3115 U32 state = trie->startstate;
3117 if (trie->bitmap && trie_type != trie_utf8_fold &&
3118 !TRIE_BITMAP_TEST(trie,*locinput)
3120 if (trie->states[ state ].wordnum) {
3122 PerlIO_printf(Perl_debug_log,
3123 "%*s %smatched empty string...%s\n",
3124 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3129 PerlIO_printf(Perl_debug_log,
3130 "%*s %sfailed to match trie start class...%s\n",
3131 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3138 U8 *uc = ( U8* )locinput;
3142 U8 *uscan = (U8*)NULL;
3144 SV *sv_accept_buff = NULL;
3145 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3147 ST.accepted = 0; /* how many accepting states we have seen */
3149 ST.jump = trie->jump;
3152 traverse the TRIE keeping track of all accepting states
3153 we transition through until we get to a failing node.
3156 while ( state && uc <= (U8*)PL_regeol ) {
3157 U32 base = trie->states[ state ].trans.base;
3160 /* We use charid to hold the wordnum as we don't use it
3161 for charid until after we have done the wordnum logic.
3162 We define an alias just so that the wordnum logic reads
3165 #define got_wordnum charid
3166 got_wordnum = trie->states[ state ].wordnum;
3168 if ( got_wordnum ) {
3169 if ( ! ST.accepted ) {
3171 SAVETMPS; /* XXX is this necessary? dmq */
3172 bufflen = TRIE_INITAL_ACCEPT_BUFFLEN;
3173 sv_accept_buff=newSV(bufflen *
3174 sizeof(reg_trie_accepted) - 1);
3175 SvCUR_set(sv_accept_buff, 0);
3176 SvPOK_on(sv_accept_buff);
3177 sv_2mortal(sv_accept_buff);
3180 (reg_trie_accepted*)SvPV_nolen(sv_accept_buff );
3183 if (ST.accepted >= bufflen) {
3185 ST.accept_buff =(reg_trie_accepted*)
3186 SvGROW(sv_accept_buff,
3187 bufflen * sizeof(reg_trie_accepted));
3189 SvCUR_set(sv_accept_buff,SvCUR(sv_accept_buff)
3190 + sizeof(reg_trie_accepted));
3193 ST.accept_buff[ST.accepted].wordnum = got_wordnum;
3194 ST.accept_buff[ST.accepted].endpos = uc;
3196 } while (trie->nextword && (got_wordnum= trie->nextword[got_wordnum]));
3200 DEBUG_TRIE_EXECUTE_r({
3201 DUMP_EXEC_POS( (char *)uc, scan, do_utf8 );
3202 PerlIO_printf( Perl_debug_log,
3203 "%*s %sState: %4"UVxf" Accepted: %4"UVxf" ",
3204 2+depth * 2, "", PL_colors[4],
3205 (UV)state, (UV)ST.accepted );
3209 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3210 uscan, len, uvc, charid, foldlen,
3214 (base + charid > trie->uniquecharcount )
3215 && (base + charid - 1 - trie->uniquecharcount
3217 && trie->trans[base + charid - 1 -
3218 trie->uniquecharcount].check == state)
3220 state = trie->trans[base + charid - 1 -
3221 trie->uniquecharcount ].next;
3232 DEBUG_TRIE_EXECUTE_r(
3233 PerlIO_printf( Perl_debug_log,
3234 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3235 charid, uvc, (UV)state, PL_colors[5] );
3242 PerlIO_printf( Perl_debug_log,
3243 "%*s %sgot %"IVdf" possible matches%s\n",
3244 REPORT_CODE_OFF + depth * 2, "",
3245 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3248 goto trie_first_try; /* jump into the fail handler */
3250 case TRIE_next_fail: /* we failed - try next alterative */
3252 REGCP_UNWIND(ST.cp);
3253 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3254 PL_regoffs[n].end = -1;
3255 *PL_reglastparen = n;
3264 ST.lastparen = *PL_reglastparen;
3267 if ( ST.accepted == 1 ) {
3268 /* only one choice left - just continue */
3270 AV *const trie_words
3271 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3272 SV ** const tmp = av_fetch( trie_words,
3273 ST.accept_buff[ 0 ].wordnum-1, 0 );
3274 SV *sv= tmp ? sv_newmortal() : NULL;
3276 PerlIO_printf( Perl_debug_log,
3277 "%*s %sonly one match left: #%d <%s>%s\n",
3278 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3279 ST.accept_buff[ 0 ].wordnum,
3280 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3281 PL_colors[0], PL_colors[1],
3282 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3284 : "not compiled under -Dr",
3287 PL_reginput = (char *)ST.accept_buff[ 0 ].endpos;
3288 /* in this case we free tmps/leave before we call regmatch
3289 as we wont be using accept_buff again. */
3291 locinput = PL_reginput;
3292 nextchr = UCHARAT(locinput);
3293 if ( !ST.jump || !ST.jump[ST.accept_buff[0].wordnum])
3296 scan = ST.me + ST.jump[ST.accept_buff[0].wordnum];
3297 if (!has_cutgroup) {
3302 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3305 continue; /* execute rest of RE */
3308 if ( !ST.accepted-- ) {
3310 PerlIO_printf( Perl_debug_log,
3311 "%*s %sTRIE failed...%s\n",
3312 REPORT_CODE_OFF+depth*2, "",
3323 There are at least two accepting states left. Presumably
3324 the number of accepting states is going to be low,
3325 typically two. So we simply scan through to find the one
3326 with lowest wordnum. Once we find it, we swap the last
3327 state into its place and decrement the size. We then try to
3328 match the rest of the pattern at the point where the word
3329 ends. If we succeed, control just continues along the
3330 regex; if we fail we return here to try the next accepting
3337 for( cur = 1 ; cur <= ST.accepted ; cur++ ) {
3338 DEBUG_TRIE_EXECUTE_r(
3339 PerlIO_printf( Perl_debug_log,
3340 "%*s %sgot %"IVdf" (%d) as best, looking at %"IVdf" (%d)%s\n",
3341 REPORT_CODE_OFF + depth * 2, "", PL_colors[4],
3342 (IV)best, ST.accept_buff[ best ].wordnum, (IV)cur,
3343 ST.accept_buff[ cur ].wordnum, PL_colors[5] );
3346 if (ST.accept_buff[cur].wordnum <
3347 ST.accept_buff[best].wordnum)
3352 AV *const trie_words
3353 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3354 SV ** const tmp = av_fetch( trie_words,
3355 ST.accept_buff[ best ].wordnum - 1, 0 );
3356 regnode *nextop=(!ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) ?
3358 ST.me + ST.jump[ST.accept_buff[best].wordnum];
3359 SV *sv= tmp ? sv_newmortal() : NULL;
3361 PerlIO_printf( Perl_debug_log,
3362 "%*s %strying alternation #%d <%s> at node #%d %s\n",
3363 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3364 ST.accept_buff[best].wordnum,
3365 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3366 PL_colors[0], PL_colors[1],
3367 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3368 ) : "not compiled under -Dr",
3369 REG_NODE_NUM(nextop),
3373 if ( best<ST.accepted ) {
3374 reg_trie_accepted tmp = ST.accept_buff[ best ];
3375 ST.accept_buff[ best ] = ST.accept_buff[ ST.accepted ];
3376 ST.accept_buff[ ST.accepted ] = tmp;
3379 PL_reginput = (char *)ST.accept_buff[ best ].endpos;
3380 if ( !ST.jump || !ST.jump[ST.accept_buff[best].wordnum]) {
3383 scan = ST.me + ST.jump[ST.accept_buff[best].wordnum];
3385 PUSH_YES_STATE_GOTO(TRIE_next, scan);
3390 /* we dont want to throw this away, see bug 57042*/
3391 if (oreplsv != GvSV(PL_replgv))
3392 sv_setsv(oreplsv, GvSV(PL_replgv));
3399 char *s = STRING(scan);
3401 if (do_utf8 != UTF) {
3402 /* The target and the pattern have differing utf8ness. */
3404 const char * const e = s + ln;
3407 /* The target is utf8, the pattern is not utf8. */
3412 if (NATIVE_TO_UNI(*(U8*)s) !=
3413 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3421 /* The target is not utf8, the pattern is utf8. */
3426 if (NATIVE_TO_UNI(*((U8*)l)) !=
3427 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3435 nextchr = UCHARAT(locinput);
3438 /* The target and the pattern have the same utf8ness. */
3439 /* Inline the first character, for speed. */
3440 if (UCHARAT(s) != nextchr)
3442 if (PL_regeol - locinput < ln)
3444 if (ln > 1 && memNE(s, locinput, ln))
3447 nextchr = UCHARAT(locinput);
3451 PL_reg_flags |= RF_tainted;
3454 char * const s = STRING(scan);
3457 if (do_utf8 || UTF) {
3458 /* Either target or the pattern are utf8. */
3459 const char * const l = locinput;
3460 char *e = PL_regeol;
3462 if (ibcmp_utf8(s, 0, ln, (bool)UTF,
3463 l, &e, 0, do_utf8)) {
3464 /* One more case for the sharp s:
3465 * pack("U0U*", 0xDF) =~ /ss/i,
3466 * the 0xC3 0x9F are the UTF-8
3467 * byte sequence for the U+00DF. */
3470 toLOWER(s[0]) == 's' &&
3472 toLOWER(s[1]) == 's' &&
3479 nextchr = UCHARAT(locinput);
3483 /* Neither the target and the pattern are utf8. */
3485 /* Inline the first character, for speed. */
3486 if (UCHARAT(s) != nextchr &&
3487 UCHARAT(s) != ((OP(scan) == EXACTF)
3488 ? PL_fold : PL_fold_locale)[nextchr])
3490 if (PL_regeol - locinput < ln)
3492 if (ln > 1 && (OP(scan) == EXACTF
3493 ? ibcmp(s, locinput, ln)
3494 : ibcmp_locale(s, locinput, ln)))
3497 nextchr = UCHARAT(locinput);
3502 PL_reg_flags |= RF_tainted;
3506 /* was last char in word? */
3508 if (locinput == PL_bostr)
3511 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3513 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3515 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3516 ln = isALNUM_uni(ln);
3517 LOAD_UTF8_CHARCLASS_ALNUM();
3518 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8);
3521 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3522 n = isALNUM_LC_utf8((U8*)locinput);
3526 ln = (locinput != PL_bostr) ?
3527 UCHARAT(locinput - 1) : '\n';
3528 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3530 n = isALNUM(nextchr);
3533 ln = isALNUM_LC(ln);
3534 n = isALNUM_LC(nextchr);
3537 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3538 OP(scan) == BOUNDL))
3543 STRLEN inclasslen = PL_regeol - locinput;
3545 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8))
3547 if (locinput >= PL_regeol)
3549 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3550 nextchr = UCHARAT(locinput);
3555 nextchr = UCHARAT(locinput);
3556 if (!REGINCLASS(rex, scan, (U8*)locinput))
3558 if (!nextchr && locinput >= PL_regeol)
3560 nextchr = UCHARAT(++locinput);
3564 /* If we might have the case of the German sharp s
3565 * in a casefolding Unicode character class. */
3567 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3568 locinput += SHARP_S_SKIP;
3569 nextchr = UCHARAT(locinput);
3574 /* Special char classes - The defines start on line 129 or so */
3575 CCC_TRY_AFF( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3576 CCC_TRY_NEG(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC);
3578 CCC_TRY_AFF( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3579 CCC_TRY_NEG(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC);
3581 CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3582 CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3584 case CLUMP: /* Match \X: logical Unicode character. This is defined as
3585 a Unicode extended Grapheme Cluster */
3586 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
3587 extended Grapheme Cluster is:
3590 | Prepend* Begin Extend*
3593 Begin is (Hangul-syllable | ! Control)
3594 Extend is (Grapheme_Extend | Spacing_Mark)
3595 Control is [ GCB_Control CR LF ]
3597 The discussion below shows how the code for CLUMP is derived
3598 from this regex. Note that most of these concepts are from
3599 property values of the Grapheme Cluster Boundary (GCB) property.
3600 No code point can have multiple property values for a given
3601 property. Thus a code point in Prepend can't be in Control, but
3602 it must be in !Control. This is why Control above includes
3603 GCB_Control plus CR plus LF. The latter two are used in the GCB
3604 property separately, and so can't be in GCB_Control, even though
3605 they logically are controls. Control is not the same as gc=cc,
3606 but includes format and other characters as well.
3608 The Unicode definition of Hangul-syllable is:
3610 | (L* ( ( V | LV ) V* | LVT ) T*)
3613 Each of these is a value for the GCB property, and hence must be
3614 disjoint, so the order they are tested is immaterial, so the
3615 above can safely be changed to
3618 | (L* ( LVT | ( V | LV ) V*) T*)
3620 The last two terms can be combined like this:
3622 | (( LVT | ( V | LV ) V*) T*))
3624 And refactored into this:
3625 L* (L | LVT T* | V V* T* | LV V* T*)
3627 That means that if we have seen any L's at all we can quit
3628 there, but if the next character is a LVT, a V or and LV we
3631 There is a subtlety with Prepend* which showed up in testing.
3632 Note that the Begin, and only the Begin is required in:
3633 | Prepend* Begin Extend*
3634 Also, Begin contains '! Control'. A Prepend must be a '!
3635 Control', which means it must be a Begin. What it comes down to
3636 is that if we match Prepend* and then find no suitable Begin
3637 afterwards, that if we backtrack the last Prepend, that one will
3638 be a suitable Begin.
3641 if (locinput >= PL_regeol)
3645 /* Match either CR LF or '.', as all the other possibilities
3647 locinput++; /* Match the . or CR */
3649 && locinput < PL_regeol
3650 && UCHARAT(locinput) == '\n') locinput++;
3654 /* Utf8: See if is ( CR LF ); already know that locinput <
3655 * PL_regeol, so locinput+1 is in bounds */
3656 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3660 /* In case have to backtrack to beginning, then match '.' */
3661 char *starting = locinput;
3663 /* In case have to backtrack the last prepend */
3664 char *previous_prepend = 0;
3666 LOAD_UTF8_CHARCLASS_GCB();
3668 /* Match (prepend)* */
3669 while (locinput < PL_regeol
3670 && swash_fetch(PL_utf8_X_prepend,
3671 (U8*)locinput, do_utf8))
3673 previous_prepend = locinput;
3674 locinput += UTF8SKIP(locinput);
3677 /* As noted above, if we matched a prepend character, but
3678 * the next thing won't match, back off the last prepend we
3679 * matched, as it is guaranteed to match the begin */
3680 if (previous_prepend
3681 && (locinput >= PL_regeol
3682 || ! swash_fetch(PL_utf8_X_begin,
3683 (U8*)locinput, do_utf8)))
3685 locinput = previous_prepend;
3688 /* Note that here we know PL_regeol > locinput, as we
3689 * tested that upon input to this switch case, and if we
3690 * moved locinput forward, we tested the result just above
3691 * and it either passed, or we backed off so that it will
3693 if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) {
3695 /* Here did not match the required 'Begin' in the
3696 * second term. So just match the very first
3697 * character, the '.' of the final term of the regex */
3698 locinput = starting + UTF8SKIP(starting);
3701 /* Here is the beginning of a character that can have
3702 * an extender. It is either a hangul syllable, or a
3704 if (swash_fetch(PL_utf8_X_non_hangul,
3705 (U8*)locinput, do_utf8))
3708 /* Here not a Hangul syllable, must be a
3709 * ('! * Control') */
3710 locinput += UTF8SKIP(locinput);
3713 /* Here is a Hangul syllable. It can be composed
3714 * of several individual characters. One
3715 * possibility is T+ */
3716 if (swash_fetch(PL_utf8_X_T,
3717 (U8*)locinput, do_utf8))
3719 while (locinput < PL_regeol
3720 && swash_fetch(PL_utf8_X_T,
3721 (U8*)locinput, do_utf8))
3723 locinput += UTF8SKIP(locinput);
3727 /* Here, not T+, but is a Hangul. That means
3728 * it is one of the others: L, LV, LVT or V,
3730 * L* (L | LVT T* | V V* T* | LV V* T*) */
3733 while (locinput < PL_regeol
3734 && swash_fetch(PL_utf8_X_L,
3735 (U8*)locinput, do_utf8))
3737 locinput += UTF8SKIP(locinput);
3740 /* Here, have exhausted L*. If the next
3741 * character is not an LV, LVT nor V, it means
3742 * we had to have at least one L, so matches L+
3743 * in the original equation, we have a complete
3744 * hangul syllable. Are done. */
3746 if (locinput < PL_regeol
3747 && swash_fetch(PL_utf8_X_LV_LVT_V,
3748 (U8*)locinput, do_utf8))
3751 /* Otherwise keep going. Must be LV, LVT
3752 * or V. See if LVT */
3753 if (swash_fetch(PL_utf8_X_LVT,
3754 (U8*)locinput, do_utf8))
3756 locinput += UTF8SKIP(locinput);
3759 /* Must be V or LV. Take it, then
3761 locinput += UTF8SKIP(locinput);
3762 while (locinput < PL_regeol
3763 && swash_fetch(PL_utf8_X_V,
3764 (U8*)locinput, do_utf8))
3766 locinput += UTF8SKIP(locinput);
3770 /* And any of LV, LVT, or V can be followed
3772 while (locinput < PL_regeol
3773 && swash_fetch(PL_utf8_X_T,
3777 locinput += UTF8SKIP(locinput);
3783 /* Match any extender */
3784 while (locinput < PL_regeol
3785 && swash_fetch(PL_utf8_X_extend,
3786 (U8*)locinput, do_utf8))
3788 locinput += UTF8SKIP(locinput);
3792 if (locinput > PL_regeol) sayNO;
3794 nextchr = UCHARAT(locinput);
3801 PL_reg_flags |= RF_tainted;
3806 n = reg_check_named_buff_matched(rex,scan);
3809 type = REF + ( type - NREF );
3816 PL_reg_flags |= RF_tainted;
3820 n = ARG(scan); /* which paren pair */
3823 ln = PL_regoffs[n].start;
3824 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3825 if (*PL_reglastparen < n || ln == -1)
3826 sayNO; /* Do not match unless seen CLOSEn. */
3827 if (ln == PL_regoffs[n].end)
3831 if (do_utf8 && type != REF) { /* REF can do byte comparison */
3833 const char *e = PL_bostr + PL_regoffs[n].end;
3835 * Note that we can't do the "other character" lookup trick as
3836 * in the 8-bit case (no pun intended) because in Unicode we
3837 * have to map both upper and title case to lower case.
3841 STRLEN ulen1, ulen2;
3842 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3843 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3847 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3848 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3849 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3856 nextchr = UCHARAT(locinput);
3860 /* Inline the first character, for speed. */
3861 if (UCHARAT(s) != nextchr &&
3863 (UCHARAT(s) != (type == REFF
3864 ? PL_fold : PL_fold_locale)[nextchr])))
3866 ln = PL_regoffs[n].end - ln;
3867 if (locinput + ln > PL_regeol)
3869 if (ln > 1 && (type == REF
3870 ? memNE(s, locinput, ln)
3872 ? ibcmp(s, locinput, ln)
3873 : ibcmp_locale(s, locinput, ln))))
3876 nextchr = UCHARAT(locinput);
3886 #define ST st->u.eval
3891 regexp_internal *rei;
3892 regnode *startpoint;
3895 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3896 if (cur_eval && cur_eval->locinput==locinput) {
3897 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3898 Perl_croak(aTHX_ "Infinite recursion in regex");
3899 if ( ++nochange_depth > max_nochange_depth )
3901 "Pattern subroutine nesting without pos change"
3902 " exceeded limit in regex");
3909 (void)ReREFCNT_inc(rex_sv);
3910 if (OP(scan)==GOSUB) {
3911 startpoint = scan + ARG2L(scan);
3912 ST.close_paren = ARG(scan);
3914 startpoint = rei->program+1;
3917 goto eval_recurse_doit;
3919 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
3920 if (cur_eval && cur_eval->locinput==locinput) {
3921 if ( ++nochange_depth > max_nochange_depth )
3922 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
3927 /* execute the code in the {...} */
3929 SV ** const before = SP;
3930 OP_4tree * const oop = PL_op;
3931 COP * const ocurcop = PL_curcop;
3933 char *saved_regeol = PL_regeol;
3936 PL_op = (OP_4tree*)rexi->data->data[n];
3937 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
3938 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
3939 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
3940 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
3943 SV *sv_mrk = get_sv("REGMARK", 1);
3944 sv_setsv(sv_mrk, sv_yes_mark);
3947 CALLRUNOPS(aTHX); /* Scalar context. */
3950 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
3957 PAD_RESTORE_LOCAL(old_comppad);
3958 PL_curcop = ocurcop;
3959 PL_regeol = saved_regeol;
3962 sv_setsv(save_scalar(PL_replgv), ret);
3966 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
3969 /* extract RE object from returned value; compiling if
3975 SV *const sv = SvRV(ret);
3977 if (SvTYPE(sv) == SVt_REGEXP) {
3979 } else if (SvSMAGICAL(sv)) {
3980 mg = mg_find(sv, PERL_MAGIC_qr);
3983 } else if (SvTYPE(ret) == SVt_REGEXP) {
3985 } else if (SvSMAGICAL(ret)) {
3986 if (SvGMAGICAL(ret)) {
3987 /* I don't believe that there is ever qr magic
3989 assert(!mg_find(ret, PERL_MAGIC_qr));
3990 sv_unmagic(ret, PERL_MAGIC_qr);
3993 mg = mg_find(ret, PERL_MAGIC_qr);
3994 /* testing suggests mg only ends up non-NULL for
3995 scalars who were upgraded and compiled in the
3996 else block below. In turn, this is only
3997 triggered in the "postponed utf8 string" tests
4003 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4007 rx = reg_temp_copy(NULL, rx);
4011 const I32 osize = PL_regsize;
4014 assert (SvUTF8(ret));
4015 } else if (SvUTF8(ret)) {
4016 /* Not doing UTF-8, despite what the SV says. Is
4017 this only if we're trapped in use 'bytes'? */
4018 /* Make a copy of the octet sequence, but without
4019 the flag on, as the compiler now honours the
4020 SvUTF8 flag on ret. */
4022 const char *const p = SvPV(ret, len);
4023 ret = newSVpvn_flags(p, len, SVs_TEMP);
4025 rx = CALLREGCOMP(ret, pm_flags);
4027 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4029 /* This isn't a first class regexp. Instead, it's
4030 caching a regexp onto an existing, Perl visible
4032 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4037 re = (struct regexp *)SvANY(rx);
4039 RXp_MATCH_COPIED_off(re);
4040 re->subbeg = rex->subbeg;
4041 re->sublen = rex->sublen;
4044 debug_start_match(re_sv, do_utf8, locinput, PL_regeol,
4045 "Matching embedded");
4047 startpoint = rei->program + 1;
4048 ST.close_paren = 0; /* only used for GOSUB */
4049 /* borrowed from regtry */
4050 if (PL_reg_start_tmpl <= re->nparens) {
4051 PL_reg_start_tmpl = re->nparens*3/2 + 3;
4052 if(PL_reg_start_tmp)
4053 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4055 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4058 eval_recurse_doit: /* Share code with GOSUB below this line */
4059 /* run the pattern returned from (??{...}) */
4060 ST.cp = regcppush(0); /* Save *all* the positions. */
4061 REGCP_SET(ST.lastcp);
4063 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4065 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4066 PL_reglastparen = &re->lastparen;
4067 PL_reglastcloseparen = &re->lastcloseparen;
4069 re->lastcloseparen = 0;
4071 PL_reginput = locinput;
4074 /* XXXX This is too dramatic a measure... */
4077 ST.toggle_reg_flags = PL_reg_flags;
4079 PL_reg_flags |= RF_utf8;
4081 PL_reg_flags &= ~RF_utf8;
4082 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4084 ST.prev_rex = rex_sv;
4085 ST.prev_curlyx = cur_curlyx;
4086 SETREX(rex_sv,re_sv);
4091 ST.prev_eval = cur_eval;
4093 /* now continue from first node in postoned RE */
4094 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4097 /* logical is 1, /(?(?{...})X|Y)/ */
4098 sw = (bool)SvTRUE(ret);
4103 case EVAL_AB: /* cleanup after a successful (??{A})B */
4104 /* note: this is called twice; first after popping B, then A */
4105 PL_reg_flags ^= ST.toggle_reg_flags;
4106 ReREFCNT_dec(rex_sv);
4107 SETREX(rex_sv,ST.prev_rex);
4108 rex = (struct regexp *)SvANY(rex_sv);
4109 rexi = RXi_GET(rex);
4111 cur_eval = ST.prev_eval;
4112 cur_curlyx = ST.prev_curlyx;
4114 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4115 PL_reglastparen = &rex->lastparen;
4116 PL_reglastcloseparen = &rex->lastcloseparen;
4117 /* also update PL_regoffs */
4118 PL_regoffs = rex->offs;
4120 /* XXXX This is too dramatic a measure... */
4122 if ( nochange_depth )
4127 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4128 /* note: this is called twice; first after popping B, then A */
4129 PL_reg_flags ^= ST.toggle_reg_flags;
4130 ReREFCNT_dec(rex_sv);
4131 SETREX(rex_sv,ST.prev_rex);
4132 rex = (struct regexp *)SvANY(rex_sv);
4133 rexi = RXi_GET(rex);
4134 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4135 PL_reglastparen = &rex->lastparen;
4136 PL_reglastcloseparen = &rex->lastcloseparen;
4138 PL_reginput = locinput;
4139 REGCP_UNWIND(ST.lastcp);
4141 cur_eval = ST.prev_eval;
4142 cur_curlyx = ST.prev_curlyx;
4143 /* XXXX This is too dramatic a measure... */
4145 if ( nochange_depth )
4151 n = ARG(scan); /* which paren pair */
4152 PL_reg_start_tmp[n] = locinput;
4158 n = ARG(scan); /* which paren pair */
4159 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4160 PL_regoffs[n].end = locinput - PL_bostr;
4161 /*if (n > PL_regsize)
4163 if (n > *PL_reglastparen)
4164 *PL_reglastparen = n;
4165 *PL_reglastcloseparen = n;
4166 if (cur_eval && cur_eval->u.eval.close_paren == n) {
4174 cursor && OP(cursor)!=END;
4175 cursor=regnext(cursor))
4177 if ( OP(cursor)==CLOSE ){
4179 if ( n <= lastopen ) {
4181 = PL_reg_start_tmp[n] - PL_bostr;
4182 PL_regoffs[n].end = locinput - PL_bostr;
4183 /*if (n > PL_regsize)
4185 if (n > *PL_reglastparen)
4186 *PL_reglastparen = n;
4187 *PL_reglastcloseparen = n;
4188 if ( n == ARG(scan) || (cur_eval &&
4189 cur_eval->u.eval.close_paren == n))
4198 n = ARG(scan); /* which paren pair */
4199 sw = (bool)(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4202 /* reg_check_named_buff_matched returns 0 for no match */
4203 sw = (bool)(0 < reg_check_named_buff_matched(rex,scan));
4207 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4213 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4215 next = NEXTOPER(NEXTOPER(scan));
4217 next = scan + ARG(scan);
4218 if (OP(next) == IFTHEN) /* Fake one. */
4219 next = NEXTOPER(NEXTOPER(next));
4223 logical = scan->flags;
4226 /*******************************************************************
4228 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4229 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4230 STAR/PLUS/CURLY/CURLYN are used instead.)
4232 A*B is compiled as <CURLYX><A><WHILEM><B>
4234 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4235 state, which contains the current count, initialised to -1. It also sets
4236 cur_curlyx to point to this state, with any previous value saved in the
4239 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4240 since the pattern may possibly match zero times (i.e. it's a while {} loop
4241 rather than a do {} while loop).
4243 Each entry to WHILEM represents a successful match of A. The count in the
4244 CURLYX block is incremented, another WHILEM state is pushed, and execution
4245 passes to A or B depending on greediness and the current count.
4247 For example, if matching against the string a1a2a3b (where the aN are
4248 substrings that match /A/), then the match progresses as follows: (the
4249 pushed states are interspersed with the bits of strings matched so far):
4252 <CURLYX cnt=0><WHILEM>
4253 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4254 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4255 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4256 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4258 (Contrast this with something like CURLYM, which maintains only a single
4262 a1 <CURLYM cnt=1> a2
4263 a1 a2 <CURLYM cnt=2> a3
4264 a1 a2 a3 <CURLYM cnt=3> b
4267 Each WHILEM state block marks a point to backtrack to upon partial failure
4268 of A or B, and also contains some minor state data related to that
4269 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4270 overall state, such as the count, and pointers to the A and B ops.
4272 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4273 must always point to the *current* CURLYX block, the rules are:
4275 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4276 and set cur_curlyx to point the new block.
4278 When popping the CURLYX block after a successful or unsuccessful match,
4279 restore the previous cur_curlyx.
4281 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4282 to the outer one saved in the CURLYX block.
4284 When popping the WHILEM block after a successful or unsuccessful B match,
4285 restore the previous cur_curlyx.
4287 Here's an example for the pattern (AI* BI)*BO
4288 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4291 curlyx backtrack stack
4292 ------ ---------------
4294 CO <CO prev=NULL> <WO>
4295 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4296 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4297 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4299 At this point the pattern succeeds, and we work back down the stack to
4300 clean up, restoring as we go:
4302 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4303 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4304 CO <CO prev=NULL> <WO>
4307 *******************************************************************/
4309 #define ST st->u.curlyx
4311 case CURLYX: /* start of /A*B/ (for complex A) */
4313 /* No need to save/restore up to this paren */
4314 I32 parenfloor = scan->flags;
4316 assert(next); /* keep Coverity happy */
4317 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4320 /* XXXX Probably it is better to teach regpush to support
4321 parenfloor > PL_regsize... */
4322 if (parenfloor > (I32)*PL_reglastparen)
4323 parenfloor = *PL_reglastparen; /* Pessimization... */
4325 ST.prev_curlyx= cur_curlyx;
4327 ST.cp = PL_savestack_ix;
4329 /* these fields contain the state of the current curly.
4330 * they are accessed by subsequent WHILEMs */
4331 ST.parenfloor = parenfloor;
4332 ST.min = ARG1(scan);
4333 ST.max = ARG2(scan);
4334 ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4338 ST.count = -1; /* this will be updated by WHILEM */
4339 ST.lastloc = NULL; /* this will be updated by WHILEM */
4341 PL_reginput = locinput;
4342 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4346 case CURLYX_end: /* just finished matching all of A*B */
4347 cur_curlyx = ST.prev_curlyx;
4351 case CURLYX_end_fail: /* just failed to match all of A*B */
4353 cur_curlyx = ST.prev_curlyx;
4359 #define ST st->u.whilem
4361 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4363 /* see the discussion above about CURLYX/WHILEM */
4365 assert(cur_curlyx); /* keep Coverity happy */
4366 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4367 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4368 ST.cache_offset = 0;
4371 PL_reginput = locinput;
4373 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4374 "%*s whilem: matched %ld out of %ld..%ld\n",
4375 REPORT_CODE_OFF+depth*2, "", (long)n,
4376 (long)cur_curlyx->u.curlyx.min,
4377 (long)cur_curlyx->u.curlyx.max)
4380 /* First just match a string of min A's. */
4382 if (n < cur_curlyx->u.curlyx.min) {
4383 cur_curlyx->u.curlyx.lastloc = locinput;
4384 PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A);
4388 /* If degenerate A matches "", assume A done. */
4390 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4391 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4392 "%*s whilem: empty match detected, trying continuation...\n",
4393 REPORT_CODE_OFF+depth*2, "")
4395 goto do_whilem_B_max;
4398 /* super-linear cache processing */
4402 if (!PL_reg_maxiter) {
4403 /* start the countdown: Postpone detection until we
4404 * know the match is not *that* much linear. */
4405 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4406 /* possible overflow for long strings and many CURLYX's */
4407 if (PL_reg_maxiter < 0)
4408 PL_reg_maxiter = I32_MAX;
4409 PL_reg_leftiter = PL_reg_maxiter;
4412 if (PL_reg_leftiter-- == 0) {
4413 /* initialise cache */
4414 const I32 size = (PL_reg_maxiter + 7)/8;
4415 if (PL_reg_poscache) {
4416 if ((I32)PL_reg_poscache_size < size) {
4417 Renew(PL_reg_poscache, size, char);
4418 PL_reg_poscache_size = size;
4420 Zero(PL_reg_poscache, size, char);
4423 PL_reg_poscache_size = size;
4424 Newxz(PL_reg_poscache, size, char);
4426 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4427 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4428 PL_colors[4], PL_colors[5])
4432 if (PL_reg_leftiter < 0) {
4433 /* have we already failed at this position? */
4435 offset = (scan->flags & 0xf) - 1
4436 + (locinput - PL_bostr) * (scan->flags>>4);
4437 mask = 1 << (offset % 8);
4439 if (PL_reg_poscache[offset] & mask) {
4440 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4441 "%*s whilem: (cache) already tried at this position...\n",
4442 REPORT_CODE_OFF+depth*2, "")
4444 sayNO; /* cache records failure */
4446 ST.cache_offset = offset;
4447 ST.cache_mask = mask;
4451 /* Prefer B over A for minimal matching. */
4453 if (cur_curlyx->u.curlyx.minmod) {
4454 ST.save_curlyx = cur_curlyx;
4455 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4456 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4457 REGCP_SET(ST.lastcp);
4458 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4462 /* Prefer A over B for maximal matching. */
4464 if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */
4465 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4466 cur_curlyx->u.curlyx.lastloc = locinput;
4467 REGCP_SET(ST.lastcp);
4468 PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A);
4471 goto do_whilem_B_max;
4475 case WHILEM_B_min: /* just matched B in a minimal match */
4476 case WHILEM_B_max: /* just matched B in a maximal match */
4477 cur_curlyx = ST.save_curlyx;
4481 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4482 cur_curlyx = ST.save_curlyx;
4483 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4484 cur_curlyx->u.curlyx.count--;
4488 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4489 REGCP_UNWIND(ST.lastcp);
4492 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4493 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4494 cur_curlyx->u.curlyx.count--;
4498 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4499 REGCP_UNWIND(ST.lastcp);
4500 regcppop(rex); /* Restore some previous $<digit>s? */
4501 PL_reginput = locinput;
4502 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4503 "%*s whilem: failed, trying continuation...\n",
4504 REPORT_CODE_OFF+depth*2, "")
4507 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4508 && ckWARN(WARN_REGEXP)
4509 && !(PL_reg_flags & RF_warned))
4511 PL_reg_flags |= RF_warned;
4512 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4513 "Complex regular subexpression recursion",
4518 ST.save_curlyx = cur_curlyx;
4519 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4520 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4523 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4524 cur_curlyx = ST.save_curlyx;
4525 REGCP_UNWIND(ST.lastcp);
4528 if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) {
4529 /* Maximum greed exceeded */
4530 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4531 && ckWARN(WARN_REGEXP)
4532 && !(PL_reg_flags & RF_warned))
4534 PL_reg_flags |= RF_warned;
4535 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4536 "%s limit (%d) exceeded",
4537 "Complex regular subexpression recursion",
4540 cur_curlyx->u.curlyx.count--;
4544 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4545 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4547 /* Try grabbing another A and see if it helps. */
4548 PL_reginput = locinput;
4549 cur_curlyx->u.curlyx.lastloc = locinput;
4550 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4551 REGCP_SET(ST.lastcp);
4552 PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A);
4556 #define ST st->u.branch
4558 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4559 next = scan + ARG(scan);
4562 scan = NEXTOPER(scan);
4565 case BRANCH: /* /(...|A|...)/ */
4566 scan = NEXTOPER(scan); /* scan now points to inner node */
4567 ST.lastparen = *PL_reglastparen;
4568 ST.next_branch = next;
4570 PL_reginput = locinput;
4572 /* Now go into the branch */
4574 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4576 PUSH_STATE_GOTO(BRANCH_next, scan);
4580 PL_reginput = locinput;
4581 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4582 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4583 PUSH_STATE_GOTO(CUTGROUP_next,next);
4585 case CUTGROUP_next_fail:
4588 if (st->u.mark.mark_name)
4589 sv_commit = st->u.mark.mark_name;
4595 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4600 REGCP_UNWIND(ST.cp);
4601 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4602 PL_regoffs[n].end = -1;
4603 *PL_reglastparen = n;
4604 /*dmq: *PL_reglastcloseparen = n; */
4605 scan = ST.next_branch;
4606 /* no more branches? */
4607 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4609 PerlIO_printf( Perl_debug_log,
4610 "%*s %sBRANCH failed...%s\n",
4611 REPORT_CODE_OFF+depth*2, "",
4617 continue; /* execute next BRANCH[J] op */
4625 #define ST st->u.curlym
4627 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4629 /* This is an optimisation of CURLYX that enables us to push
4630 * only a single backtracking state, no matter how many matches
4631 * there are in {m,n}. It relies on the pattern being constant
4632 * length, with no parens to influence future backrefs
4636 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4638 /* if paren positive, emulate an OPEN/CLOSE around A */
4640 U32 paren = ST.me->flags;
4641 if (paren > PL_regsize)
4643 if (paren > *PL_reglastparen)
4644 *PL_reglastparen = paren;
4645 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4653 ST.c1 = CHRTEST_UNINIT;
4656 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4659 curlym_do_A: /* execute the A in /A{m,n}B/ */
4660 PL_reginput = locinput;
4661 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4664 case CURLYM_A: /* we've just matched an A */
4665 locinput = st->locinput;
4666 nextchr = UCHARAT(locinput);
4669 /* after first match, determine A's length: u.curlym.alen */
4670 if (ST.count == 1) {
4671 if (PL_reg_match_utf8) {
4673 while (s < PL_reginput) {
4679 ST.alen = PL_reginput - locinput;
4682 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4685 PerlIO_printf(Perl_debug_log,
4686 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4687 (int)(REPORT_CODE_OFF+(depth*2)), "",
4688 (IV) ST.count, (IV)ST.alen)
4691 locinput = PL_reginput;
4693 if (cur_eval && cur_eval->u.eval.close_paren &&
4694 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4698 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4699 if ( max == REG_INFTY || ST.count < max )
4700 goto curlym_do_A; /* try to match another A */
4702 goto curlym_do_B; /* try to match B */
4704 case CURLYM_A_fail: /* just failed to match an A */
4705 REGCP_UNWIND(ST.cp);
4707 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4708 || (cur_eval && cur_eval->u.eval.close_paren &&
4709 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4712 curlym_do_B: /* execute the B in /A{m,n}B/ */
4713 PL_reginput = locinput;
4714 if (ST.c1 == CHRTEST_UNINIT) {
4715 /* calculate c1 and c2 for possible match of 1st char
4716 * following curly */
4717 ST.c1 = ST.c2 = CHRTEST_VOID;
4718 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4719 regnode *text_node = ST.B;
4720 if (! HAS_TEXT(text_node))
4721 FIND_NEXT_IMPT(text_node);
4724 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4726 But the former is redundant in light of the latter.
4728 if this changes back then the macro for
4729 IS_TEXT and friends need to change.
4731 if (PL_regkind[OP(text_node)] == EXACT)
4734 ST.c1 = (U8)*STRING(text_node);
4736 (IS_TEXTF(text_node))
4738 : (IS_TEXTFL(text_node))
4739 ? PL_fold_locale[ST.c1]
4746 PerlIO_printf(Perl_debug_log,
4747 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4748 (int)(REPORT_CODE_OFF+(depth*2)),
4751 if (ST.c1 != CHRTEST_VOID
4752 && UCHARAT(PL_reginput) != ST.c1
4753 && UCHARAT(PL_reginput) != ST.c2)
4755 /* simulate B failing */
4757 PerlIO_printf(Perl_debug_log,
4758 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4759 (int)(REPORT_CODE_OFF+(depth*2)),"",
4762 state_num = CURLYM_B_fail;
4763 goto reenter_switch;
4767 /* mark current A as captured */
4768 I32 paren = ST.me->flags;
4770 PL_regoffs[paren].start
4771 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4772 PL_regoffs[paren].end = PL_reginput - PL_bostr;
4773 /*dmq: *PL_reglastcloseparen = paren; */
4776 PL_regoffs[paren].end = -1;
4777 if (cur_eval && cur_eval->u.eval.close_paren &&
4778 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4787 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4790 case CURLYM_B_fail: /* just failed to match a B */
4791 REGCP_UNWIND(ST.cp);
4793 I32 max = ARG2(ST.me);
4794 if (max != REG_INFTY && ST.count == max)
4796 goto curlym_do_A; /* try to match a further A */
4798 /* backtrack one A */
4799 if (ST.count == ARG1(ST.me) /* min */)
4802 locinput = HOPc(locinput, -ST.alen);
4803 goto curlym_do_B; /* try to match B */
4806 #define ST st->u.curly
4808 #define CURLY_SETPAREN(paren, success) \
4811 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4812 PL_regoffs[paren].end = locinput - PL_bostr; \
4813 *PL_reglastcloseparen = paren; \
4816 PL_regoffs[paren].end = -1; \
4819 case STAR: /* /A*B/ where A is width 1 */
4823 scan = NEXTOPER(scan);
4825 case PLUS: /* /A+B/ where A is width 1 */
4829 scan = NEXTOPER(scan);
4831 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4832 ST.paren = scan->flags; /* Which paren to set */
4833 if (ST.paren > PL_regsize)
4834 PL_regsize = ST.paren;
4835 if (ST.paren > *PL_reglastparen)
4836 *PL_reglastparen = ST.paren;
4837 ST.min = ARG1(scan); /* min to match */
4838 ST.max = ARG2(scan); /* max to match */
4839 if (cur_eval && cur_eval->u.eval.close_paren &&
4840 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4844 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4846 case CURLY: /* /A{m,n}B/ where A is width 1 */
4848 ST.min = ARG1(scan); /* min to match */
4849 ST.max = ARG2(scan); /* max to match */
4850 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4853 * Lookahead to avoid useless match attempts
4854 * when we know what character comes next.
4856 * Used to only do .*x and .*?x, but now it allows
4857 * for )'s, ('s and (?{ ... })'s to be in the way
4858 * of the quantifier and the EXACT-like node. -- japhy
4861 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4863 if (HAS_TEXT(next) || JUMPABLE(next)) {
4865 regnode *text_node = next;
4867 if (! HAS_TEXT(text_node))
4868 FIND_NEXT_IMPT(text_node);
4870 if (! HAS_TEXT(text_node))
4871 ST.c1 = ST.c2 = CHRTEST_VOID;
4873 if ( PL_regkind[OP(text_node)] != EXACT ) {
4874 ST.c1 = ST.c2 = CHRTEST_VOID;
4875 goto assume_ok_easy;
4878 s = (U8*)STRING(text_node);
4880 /* Currently we only get here when
4882 PL_rekind[OP(text_node)] == EXACT
4884 if this changes back then the macro for IS_TEXT and
4885 friends need to change. */
4888 if (IS_TEXTF(text_node))
4889 ST.c2 = PL_fold[ST.c1];
4890 else if (IS_TEXTFL(text_node))
4891 ST.c2 = PL_fold_locale[ST.c1];
4894 if (IS_TEXTF(text_node)) {
4895 STRLEN ulen1, ulen2;
4896 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
4897 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
4899 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
4900 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
4902 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
4904 0 : UTF8_ALLOW_ANY);
4905 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
4907 0 : UTF8_ALLOW_ANY);
4909 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
4911 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
4916 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
4923 ST.c1 = ST.c2 = CHRTEST_VOID;
4928 PL_reginput = locinput;
4931 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
4934 locinput = PL_reginput;
4936 if (ST.c1 == CHRTEST_VOID)
4937 goto curly_try_B_min;
4939 ST.oldloc = locinput;
4941 /* set ST.maxpos to the furthest point along the
4942 * string that could possibly match */
4943 if (ST.max == REG_INFTY) {
4944 ST.maxpos = PL_regeol - 1;
4946 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
4950 int m = ST.max - ST.min;
4951 for (ST.maxpos = locinput;
4952 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
4953 ST.maxpos += UTF8SKIP(ST.maxpos);
4956 ST.maxpos = locinput + ST.max - ST.min;
4957 if (ST.maxpos >= PL_regeol)
4958 ST.maxpos = PL_regeol - 1;
4960 goto curly_try_B_min_known;
4964 ST.count = regrepeat(rex, ST.A, ST.max, depth);
4965 locinput = PL_reginput;
4966 if (ST.count < ST.min)
4968 if ((ST.count > ST.min)
4969 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
4971 /* A{m,n} must come at the end of the string, there's
4972 * no point in backing off ... */
4974 /* ...except that $ and \Z can match before *and* after
4975 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
4976 We may back off by one in this case. */
4977 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
4981 goto curly_try_B_max;
4986 case CURLY_B_min_known_fail:
4987 /* failed to find B in a non-greedy match where c1,c2 valid */
4988 if (ST.paren && ST.count)
4989 PL_regoffs[ST.paren].end = -1;
4991 PL_reginput = locinput; /* Could be reset... */
4992 REGCP_UNWIND(ST.cp);
4993 /* Couldn't or didn't -- move forward. */
4994 ST.oldloc = locinput;
4996 locinput += UTF8SKIP(locinput);
5000 curly_try_B_min_known:
5001 /* find the next place where 'B' could work, then call B */
5005 n = (ST.oldloc == locinput) ? 0 : 1;
5006 if (ST.c1 == ST.c2) {
5008 /* set n to utf8_distance(oldloc, locinput) */
5009 while (locinput <= ST.maxpos &&
5010 utf8n_to_uvchr((U8*)locinput,
5011 UTF8_MAXBYTES, &len,
5012 uniflags) != (UV)ST.c1) {
5018 /* set n to utf8_distance(oldloc, locinput) */
5019 while (locinput <= ST.maxpos) {
5021 const UV c = utf8n_to_uvchr((U8*)locinput,
5022 UTF8_MAXBYTES, &len,
5024 if (c == (UV)ST.c1 || c == (UV)ST.c2)
5032 if (ST.c1 == ST.c2) {
5033 while (locinput <= ST.maxpos &&
5034 UCHARAT(locinput) != ST.c1)
5038 while (locinput <= ST.maxpos
5039 && UCHARAT(locinput) != ST.c1
5040 && UCHARAT(locinput) != ST.c2)
5043 n = locinput - ST.oldloc;
5045 if (locinput > ST.maxpos)
5047 /* PL_reginput == oldloc now */
5050 if (regrepeat(rex, ST.A, n, depth) < n)
5053 PL_reginput = locinput;
5054 CURLY_SETPAREN(ST.paren, ST.count);
5055 if (cur_eval && cur_eval->u.eval.close_paren &&
5056 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5059 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5064 case CURLY_B_min_fail:
5065 /* failed to find B in a non-greedy match where c1,c2 invalid */
5066 if (ST.paren && ST.count)
5067 PL_regoffs[ST.paren].end = -1;
5069 REGCP_UNWIND(ST.cp);
5070 /* failed -- move forward one */
5071 PL_reginput = locinput;
5072 if (regrepeat(rex, ST.A, 1, depth)) {
5074 locinput = PL_reginput;
5075 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5076 ST.count > 0)) /* count overflow ? */
5079 CURLY_SETPAREN(ST.paren, ST.count);
5080 if (cur_eval && cur_eval->u.eval.close_paren &&
5081 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5084 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5092 /* a successful greedy match: now try to match B */
5093 if (cur_eval && cur_eval->u.eval.close_paren &&
5094 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5099 if (ST.c1 != CHRTEST_VOID)
5100 c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput,
5101 UTF8_MAXBYTES, 0, uniflags)
5102 : (UV) UCHARAT(PL_reginput);
5103 /* If it could work, try it. */
5104 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5105 CURLY_SETPAREN(ST.paren, ST.count);
5106 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5111 case CURLY_B_max_fail:
5112 /* failed to find B in a greedy match */
5113 if (ST.paren && ST.count)
5114 PL_regoffs[ST.paren].end = -1;
5116 REGCP_UNWIND(ST.cp);
5118 if (--ST.count < ST.min)
5120 PL_reginput = locinput = HOPc(locinput, -1);
5121 goto curly_try_B_max;
5128 /* we've just finished A in /(??{A})B/; now continue with B */
5130 st->u.eval.toggle_reg_flags
5131 = cur_eval->u.eval.toggle_reg_flags;
5132 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5134 st->u.eval.prev_rex = rex_sv; /* inner */
5135 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5136 rex = (struct regexp *)SvANY(rex_sv);
5137 rexi = RXi_GET(rex);
5138 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5139 ReREFCNT_inc(rex_sv);
5140 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
5142 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5143 PL_reglastparen = &rex->lastparen;
5144 PL_reglastcloseparen = &rex->lastcloseparen;
5146 REGCP_SET(st->u.eval.lastcp);
5147 PL_reginput = locinput;
5149 /* Restore parens of the outer rex without popping the
5151 tmpix = PL_savestack_ix;
5152 PL_savestack_ix = cur_eval->u.eval.lastcp;
5154 PL_savestack_ix = tmpix;
5156 st->u.eval.prev_eval = cur_eval;
5157 cur_eval = cur_eval->u.eval.prev_eval;
5159 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5160 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5161 if ( nochange_depth )
5164 PUSH_YES_STATE_GOTO(EVAL_AB,
5165 st->u.eval.prev_eval->u.eval.B); /* match B */
5168 if (locinput < reginfo->till) {
5169 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5170 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5172 (long)(locinput - PL_reg_starttry),
5173 (long)(reginfo->till - PL_reg_starttry),
5176 sayNO_SILENT; /* Cannot match: too short. */
5178 PL_reginput = locinput; /* put where regtry can find it */
5179 sayYES; /* Success! */
5181 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5183 PerlIO_printf(Perl_debug_log,
5184 "%*s %ssubpattern success...%s\n",
5185 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5186 PL_reginput = locinput; /* put where regtry can find it */
5187 sayYES; /* Success! */
5190 #define ST st->u.ifmatch
5192 case SUSPEND: /* (?>A) */
5194 PL_reginput = locinput;
5197 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5199 goto ifmatch_trivial_fail_test;
5201 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5203 ifmatch_trivial_fail_test:
5205 char * const s = HOPBACKc(locinput, scan->flags);
5210 sw = 1 - (bool)ST.wanted;
5214 next = scan + ARG(scan);
5222 PL_reginput = locinput;
5226 ST.logical = logical;
5227 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5229 /* execute body of (?...A) */
5230 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5233 case IFMATCH_A_fail: /* body of (?...A) failed */
5234 ST.wanted = !ST.wanted;
5237 case IFMATCH_A: /* body of (?...A) succeeded */
5239 sw = (bool)ST.wanted;
5241 else if (!ST.wanted)
5244 if (OP(ST.me) == SUSPEND)
5245 locinput = PL_reginput;
5247 locinput = PL_reginput = st->locinput;
5248 nextchr = UCHARAT(locinput);
5250 scan = ST.me + ARG(ST.me);
5253 continue; /* execute B */
5258 next = scan + ARG(scan);
5263 reginfo->cutpoint = PL_regeol;
5266 PL_reginput = locinput;
5268 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5269 PUSH_STATE_GOTO(COMMIT_next,next);
5271 case COMMIT_next_fail:
5278 #define ST st->u.mark
5280 ST.prev_mark = mark_state;
5281 ST.mark_name = sv_commit = sv_yes_mark
5282 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5284 ST.mark_loc = PL_reginput = locinput;
5285 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5287 case MARKPOINT_next:
5288 mark_state = ST.prev_mark;
5291 case MARKPOINT_next_fail:
5292 if (popmark && sv_eq(ST.mark_name,popmark))
5294 if (ST.mark_loc > startpoint)
5295 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5296 popmark = NULL; /* we found our mark */
5297 sv_commit = ST.mark_name;
5300 PerlIO_printf(Perl_debug_log,
5301 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5302 REPORT_CODE_OFF+depth*2, "",
5303 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5306 mark_state = ST.prev_mark;
5307 sv_yes_mark = mark_state ?
5308 mark_state->u.mark.mark_name : NULL;
5312 PL_reginput = locinput;
5314 /* (*SKIP) : if we fail we cut here*/
5315 ST.mark_name = NULL;
5316 ST.mark_loc = locinput;
5317 PUSH_STATE_GOTO(SKIP_next,next);
5319 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5320 otherwise do nothing. Meaning we need to scan
5322 regmatch_state *cur = mark_state;
5323 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5326 if ( sv_eq( cur->u.mark.mark_name,
5329 ST.mark_name = find;
5330 PUSH_STATE_GOTO( SKIP_next, next );
5332 cur = cur->u.mark.prev_mark;
5335 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5337 case SKIP_next_fail:
5339 /* (*CUT:NAME) - Set up to search for the name as we
5340 collapse the stack*/
5341 popmark = ST.mark_name;
5343 /* (*CUT) - No name, we cut here.*/
5344 if (ST.mark_loc > startpoint)
5345 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5346 /* but we set sv_commit to latest mark_name if there
5347 is one so they can test to see how things lead to this
5350 sv_commit=mark_state->u.mark.mark_name;
5358 if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) {
5360 } else if ( 0xDF == n && !do_utf8 && !UTF ) {
5363 U8 folded[UTF8_MAXBYTES_CASE+1];
5365 const char * const l = locinput;
5366 char *e = PL_regeol;
5367 to_uni_fold(n, folded, &foldlen);
5369 if (ibcmp_utf8((const char*) folded, 0, foldlen, 1,
5370 l, &e, 0, do_utf8)) {
5375 nextchr = UCHARAT(locinput);
5378 if ((n=is_LNBREAK(locinput,do_utf8))) {
5380 nextchr = UCHARAT(locinput);
5385 #define CASE_CLASS(nAmE) \
5387 if ((n=is_##nAmE(locinput,do_utf8))) { \
5389 nextchr = UCHARAT(locinput); \
5394 if ((n=is_##nAmE(locinput,do_utf8))) { \
5397 locinput += UTF8SKIP(locinput); \
5398 nextchr = UCHARAT(locinput); \
5403 CASE_CLASS(HORIZWS);
5407 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5408 PTR2UV(scan), OP(scan));
5409 Perl_croak(aTHX_ "regexp memory corruption");
5413 /* switch break jumps here */
5414 scan = next; /* prepare to execute the next op and ... */
5415 continue; /* ... jump back to the top, reusing st */
5419 /* push a state that backtracks on success */
5420 st->u.yes.prev_yes_state = yes_state;
5424 /* push a new regex state, then continue at scan */
5426 regmatch_state *newst;
5429 regmatch_state *cur = st;
5430 regmatch_state *curyes = yes_state;
5432 regmatch_slab *slab = PL_regmatch_slab;
5433 for (;curd > -1;cur--,curd--) {
5434 if (cur < SLAB_FIRST(slab)) {
5436 cur = SLAB_LAST(slab);
5438 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5439 REPORT_CODE_OFF + 2 + depth * 2,"",
5440 curd, PL_reg_name[cur->resume_state],
5441 (curyes == cur) ? "yes" : ""
5444 curyes = cur->u.yes.prev_yes_state;
5447 DEBUG_STATE_pp("push")
5450 st->locinput = locinput;
5452 if (newst > SLAB_LAST(PL_regmatch_slab))
5453 newst = S_push_slab(aTHX);
5454 PL_regmatch_state = newst;
5456 locinput = PL_reginput;
5457 nextchr = UCHARAT(locinput);
5465 * We get here only if there's trouble -- normally "case END" is
5466 * the terminating point.
5468 Perl_croak(aTHX_ "corrupted regexp pointers");
5474 /* we have successfully completed a subexpression, but we must now
5475 * pop to the state marked by yes_state and continue from there */
5476 assert(st != yes_state);
5478 while (st != yes_state) {
5480 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5481 PL_regmatch_slab = PL_regmatch_slab->prev;
5482 st = SLAB_LAST(PL_regmatch_slab);
5486 DEBUG_STATE_pp("pop (no final)");
5488 DEBUG_STATE_pp("pop (yes)");
5494 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5495 || yes_state > SLAB_LAST(PL_regmatch_slab))
5497 /* not in this slab, pop slab */
5498 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5499 PL_regmatch_slab = PL_regmatch_slab->prev;
5500 st = SLAB_LAST(PL_regmatch_slab);
5502 depth -= (st - yes_state);
5505 yes_state = st->u.yes.prev_yes_state;
5506 PL_regmatch_state = st;
5509 locinput= st->locinput;
5510 nextchr = UCHARAT(locinput);
5512 state_num = st->resume_state + no_final;
5513 goto reenter_switch;
5516 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5517 PL_colors[4], PL_colors[5]));
5519 if (PL_reg_eval_set) {
5520 /* each successfully executed (?{...}) block does the equivalent of
5521 * local $^R = do {...}
5522 * When popping the save stack, all these locals would be undone;
5523 * bypass this by setting the outermost saved $^R to the latest
5525 if (oreplsv != GvSV(PL_replgv))
5526 sv_setsv(oreplsv, GvSV(PL_replgv));
5533 PerlIO_printf(Perl_debug_log,
5534 "%*s %sfailed...%s\n",
5535 REPORT_CODE_OFF+depth*2, "",
5536 PL_colors[4], PL_colors[5])
5548 /* there's a previous state to backtrack to */
5550 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5551 PL_regmatch_slab = PL_regmatch_slab->prev;
5552 st = SLAB_LAST(PL_regmatch_slab);
5554 PL_regmatch_state = st;
5555 locinput= st->locinput;
5556 nextchr = UCHARAT(locinput);
5558 DEBUG_STATE_pp("pop");
5560 if (yes_state == st)
5561 yes_state = st->u.yes.prev_yes_state;
5563 state_num = st->resume_state + 1; /* failure = success + 1 */
5564 goto reenter_switch;
5569 if (rex->intflags & PREGf_VERBARG_SEEN) {
5570 SV *sv_err = get_sv("REGERROR", 1);
5571 SV *sv_mrk = get_sv("REGMARK", 1);
5573 sv_commit = &PL_sv_no;
5575 sv_yes_mark = &PL_sv_yes;
5578 sv_commit = &PL_sv_yes;
5579 sv_yes_mark = &PL_sv_no;
5581 sv_setsv(sv_err, sv_commit);
5582 sv_setsv(sv_mrk, sv_yes_mark);
5585 /* clean up; in particular, free all slabs above current one */
5586 LEAVE_SCOPE(oldsave);
5592 - regrepeat - repeatedly match something simple, report how many
5595 * [This routine now assumes that it will only match on things of length 1.
5596 * That was true before, but now we assume scan - reginput is the count,
5597 * rather than incrementing count on every character. [Er, except utf8.]]
5600 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5603 register char *scan;
5605 register char *loceol = PL_regeol;
5606 register I32 hardcount = 0;
5607 register bool do_utf8 = PL_reg_match_utf8;
5609 PERL_UNUSED_ARG(depth);
5612 PERL_ARGS_ASSERT_REGREPEAT;
5615 if (max == REG_INFTY)
5617 else if (max < loceol - scan)
5618 loceol = scan + max;
5623 while (scan < loceol && hardcount < max && *scan != '\n') {
5624 scan += UTF8SKIP(scan);
5628 while (scan < loceol && *scan != '\n')
5635 while (scan < loceol && hardcount < max) {
5636 scan += UTF8SKIP(scan);
5646 case EXACT: /* length of string is 1 */
5648 while (scan < loceol && UCHARAT(scan) == c)
5651 case EXACTF: /* length of string is 1 */
5653 while (scan < loceol &&
5654 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5657 case EXACTFL: /* length of string is 1 */
5658 PL_reg_flags |= RF_tainted;
5660 while (scan < loceol &&
5661 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5667 while (hardcount < max && scan < loceol &&
5668 reginclass(prog, p, (U8*)scan, 0, do_utf8)) {
5669 scan += UTF8SKIP(scan);
5673 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5680 LOAD_UTF8_CHARCLASS_ALNUM();
5681 while (hardcount < max && scan < loceol &&
5682 swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5683 scan += UTF8SKIP(scan);
5687 while (scan < loceol && isALNUM(*scan))
5692 PL_reg_flags |= RF_tainted;
5695 while (hardcount < max && scan < loceol &&
5696 isALNUM_LC_utf8((U8*)scan)) {
5697 scan += UTF8SKIP(scan);
5701 while (scan < loceol && isALNUM_LC(*scan))
5708 LOAD_UTF8_CHARCLASS_ALNUM();
5709 while (hardcount < max && scan < loceol &&
5710 !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) {
5711 scan += UTF8SKIP(scan);
5715 while (scan < loceol && !isALNUM(*scan))
5720 PL_reg_flags |= RF_tainted;
5723 while (hardcount < max && scan < loceol &&
5724 !isALNUM_LC_utf8((U8*)scan)) {
5725 scan += UTF8SKIP(scan);
5729 while (scan < loceol && !isALNUM_LC(*scan))
5736 LOAD_UTF8_CHARCLASS_SPACE();
5737 while (hardcount < max && scan < loceol &&
5739 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5740 scan += UTF8SKIP(scan);
5744 while (scan < loceol && isSPACE(*scan))
5749 PL_reg_flags |= RF_tainted;
5752 while (hardcount < max && scan < loceol &&
5753 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5754 scan += UTF8SKIP(scan);
5758 while (scan < loceol && isSPACE_LC(*scan))
5765 LOAD_UTF8_CHARCLASS_SPACE();
5766 while (hardcount < max && scan < loceol &&
5768 swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) {
5769 scan += UTF8SKIP(scan);
5773 while (scan < loceol && !isSPACE(*scan))
5778 PL_reg_flags |= RF_tainted;
5781 while (hardcount < max && scan < loceol &&
5782 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5783 scan += UTF8SKIP(scan);
5787 while (scan < loceol && !isSPACE_LC(*scan))
5794 LOAD_UTF8_CHARCLASS_DIGIT();
5795 while (hardcount < max && scan < loceol &&
5796 swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5797 scan += UTF8SKIP(scan);
5801 while (scan < loceol && isDIGIT(*scan))
5808 LOAD_UTF8_CHARCLASS_DIGIT();
5809 while (hardcount < max && scan < loceol &&
5810 !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) {
5811 scan += UTF8SKIP(scan);
5815 while (scan < loceol && !isDIGIT(*scan))
5821 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5827 LNBREAK can match two latin chars, which is ok,
5828 because we have a null terminated string, but we
5829 have to use hardcount in this situation
5831 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5840 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5845 while (scan < loceol && is_HORIZWS_latin1(scan))
5852 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5853 scan += UTF8SKIP(scan);
5857 while (scan < loceol && !is_HORIZWS_latin1(scan))
5865 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
5870 while (scan < loceol && is_VERTWS_latin1(scan))
5878 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
5879 scan += UTF8SKIP(scan);
5883 while (scan < loceol && !is_VERTWS_latin1(scan))
5889 default: /* Called on something of 0 width. */
5890 break; /* So match right here or not at all. */
5896 c = scan - PL_reginput;
5900 GET_RE_DEBUG_FLAGS_DECL;
5902 SV * const prop = sv_newmortal();
5903 regprop(prog, prop, p);
5904 PerlIO_printf(Perl_debug_log,
5905 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
5906 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
5914 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
5916 - regclass_swash - prepare the utf8 swash
5920 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
5926 RXi_GET_DECL(prog,progi);
5927 const struct reg_data * const data = prog ? progi->data : NULL;
5929 PERL_ARGS_ASSERT_REGCLASS_SWASH;
5931 if (data && data->count) {
5932 const U32 n = ARG(node);
5934 if (data->what[n] == 's') {
5935 SV * const rv = MUTABLE_SV(data->data[n]);
5936 AV * const av = MUTABLE_AV(SvRV(rv));
5937 SV **const ary = AvARRAY(av);
5940 /* See the end of regcomp.c:S_regclass() for
5941 * documentation of these array elements. */
5944 a = SvROK(ary[1]) ? &ary[1] : NULL;
5945 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
5949 else if (si && doinit) {
5950 sw = swash_init("utf8", "", si, 1, 0);
5951 (void)av_store(av, 1, sw);
5968 - reginclass - determine if a character falls into a character class
5970 The n is the ANYOF regnode, the p is the target string, lenp
5971 is pointer to the maximum length of how far to go in the p
5972 (if the lenp is zero, UTF8SKIP(p) is used),
5973 do_utf8 tells whether the target string is in UTF-8.
5978 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8)
5981 const char flags = ANYOF_FLAGS(n);
5987 PERL_ARGS_ASSERT_REGINCLASS;
5989 if (do_utf8 && !UTF8_IS_INVARIANT(c)) {
5990 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
5991 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
5992 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
5993 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
5994 * UTF8_ALLOW_FFFF */
5995 if (len == (STRLEN)-1)
5996 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
5999 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
6000 if (do_utf8 || (flags & ANYOF_UNICODE)) {
6003 if (do_utf8 && !ANYOF_RUNTIME(n)) {
6004 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
6007 if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256)
6011 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6019 utf8_p = bytes_to_utf8(p, &len);
6021 if (swash_fetch(sw, utf8_p, 1))
6023 else if (flags & ANYOF_FOLD) {
6024 if (!match && lenp && av) {
6026 for (i = 0; i <= av_len(av); i++) {
6027 SV* const sv = *av_fetch(av, i, FALSE);
6029 const char * const s = SvPV_const(sv, len);
6030 if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
6038 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
6041 to_utf8_fold(utf8_p, tmpbuf, &tmplen);
6042 if (swash_fetch(sw, tmpbuf, 1))
6047 /* If we allocated a string above, free it */
6048 if (! do_utf8) Safefree(utf8_p);
6051 if (match && lenp && *lenp == 0)
6052 *lenp = UNISKIP(NATIVE_TO_UNI(c));
6054 if (!match && c < 256) {
6055 if (ANYOF_BITMAP_TEST(n, c))
6057 else if (flags & ANYOF_FOLD) {
6060 if (flags & ANYOF_LOCALE) {
6061 PL_reg_flags |= RF_tainted;
6062 f = PL_fold_locale[c];
6066 if (f != c && ANYOF_BITMAP_TEST(n, f))
6070 if (!match && (flags & ANYOF_CLASS)) {
6071 PL_reg_flags |= RF_tainted;
6073 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
6074 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
6075 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
6076 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
6077 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
6078 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
6079 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
6080 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6081 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
6082 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
6083 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
6084 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
6085 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
6086 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
6087 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
6088 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
6089 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
6090 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
6091 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
6092 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
6093 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
6094 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
6095 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
6096 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
6097 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
6098 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
6099 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
6100 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
6101 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
6102 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
6103 ) /* How's that for a conditional? */
6110 return (flags & ANYOF_INVERT) ? !match : match;
6114 S_reghop3(U8 *s, I32 off, const U8* lim)
6118 PERL_ARGS_ASSERT_REGHOP3;
6121 while (off-- && s < lim) {
6122 /* XXX could check well-formedness here */
6127 while (off++ && s > lim) {
6129 if (UTF8_IS_CONTINUED(*s)) {
6130 while (s > lim && UTF8_IS_CONTINUATION(*s))
6133 /* XXX could check well-formedness here */
6140 /* there are a bunch of places where we use two reghop3's that should
6141 be replaced with this routine. but since thats not done yet
6142 we ifdef it out - dmq
6145 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6149 PERL_ARGS_ASSERT_REGHOP4;
6152 while (off-- && s < rlim) {
6153 /* XXX could check well-formedness here */
6158 while (off++ && s > llim) {
6160 if (UTF8_IS_CONTINUED(*s)) {
6161 while (s > llim && UTF8_IS_CONTINUATION(*s))
6164 /* XXX could check well-formedness here */
6172 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6176 PERL_ARGS_ASSERT_REGHOPMAYBE3;
6179 while (off-- && s < lim) {
6180 /* XXX could check well-formedness here */
6187 while (off++ && s > lim) {
6189 if (UTF8_IS_CONTINUED(*s)) {
6190 while (s > lim && UTF8_IS_CONTINUATION(*s))
6193 /* XXX could check well-formedness here */
6202 restore_pos(pTHX_ void *arg)
6205 regexp * const rex = (regexp *)arg;
6206 if (PL_reg_eval_set) {
6207 if (PL_reg_oldsaved) {
6208 rex->subbeg = PL_reg_oldsaved;
6209 rex->sublen = PL_reg_oldsavedlen;
6210 #ifdef PERL_OLD_COPY_ON_WRITE
6211 rex->saved_copy = PL_nrs;
6213 RXp_MATCH_COPIED_on(rex);
6215 PL_reg_magic->mg_len = PL_reg_oldpos;
6216 PL_reg_eval_set = 0;
6217 PL_curpm = PL_reg_oldcurpm;
6222 S_to_utf8_substr(pTHX_ register regexp *prog)
6226 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6229 if (prog->substrs->data[i].substr
6230 && !prog->substrs->data[i].utf8_substr) {
6231 SV* const sv = newSVsv(prog->substrs->data[i].substr);
6232 prog->substrs->data[i].utf8_substr = sv;
6233 sv_utf8_upgrade(sv);
6234 if (SvVALID(prog->substrs->data[i].substr)) {
6235 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6236 if (flags & FBMcf_TAIL) {
6237 /* Trim the trailing \n that fbm_compile added last
6239 SvCUR_set(sv, SvCUR(sv) - 1);
6240 /* Whilst this makes the SV technically "invalid" (as its
6241 buffer is no longer followed by "\0") when fbm_compile()
6242 adds the "\n" back, a "\0" is restored. */
6244 fbm_compile(sv, flags);
6246 if (prog->substrs->data[i].substr == prog->check_substr)
6247 prog->check_utf8 = sv;
6253 S_to_byte_substr(pTHX_ register regexp *prog)
6258 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6261 if (prog->substrs->data[i].utf8_substr
6262 && !prog->substrs->data[i].substr) {
6263 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6264 if (sv_utf8_downgrade(sv, TRUE)) {
6265 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6267 = BmFLAGS(prog->substrs->data[i].utf8_substr);
6268 if (flags & FBMcf_TAIL) {
6269 /* Trim the trailing \n that fbm_compile added last
6271 SvCUR_set(sv, SvCUR(sv) - 1);
6273 fbm_compile(sv, flags);
6279 prog->substrs->data[i].substr = sv;
6280 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6281 prog->check_substr = sv;
6288 * c-indentation-style: bsd
6290 * indent-tabs-mode: t
6293 * ex: set ts=8 sts=4 sw=4 noet: