5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
78 #ifdef PERL_IN_XSUB_RE
84 #define RF_tainted 1 /* tainted information used? */
85 #define RF_warned 2 /* warned about big count? */
87 #define RF_utf8 8 /* Pattern contains multibyte chars? */
89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
91 #define RS_init 1 /* eval environment created */
92 #define RS_set 2 /* replsv value is set */
98 /* Valid for non-utf8 strings only: avoids the reginclass call if there are no
99 * complications: i.e., if everything matchable is straight forward in the
101 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \
102 : ANYOF_BITMAP_TEST(p,*(c)))
108 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
109 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
111 #define HOPc(pos,off) \
112 (char *)(PL_reg_match_utf8 \
113 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
115 #define HOPBACKc(pos, off) \
116 (char*)(PL_reg_match_utf8\
117 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
118 : (pos - off >= PL_bostr) \
122 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
123 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
125 /* these are unrolled below in the CCC_TRY_XXX defined */
126 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
127 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
129 /* Doesn't do an assert to verify that is correct */
130 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
131 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
133 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
134 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
135 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
137 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
138 LOAD_UTF8_CHARCLASS(X_begin, " "); \
139 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
140 /* These are utf8 constants, and not utf-ebcdic constants, so the \
141 * assert should likely and hopefully fail on an EBCDIC machine */ \
142 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
144 /* No asserts are done for these, in case called on an early \
145 * Unicode version in which they map to nothing */ \
146 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
147 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
148 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
149 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
150 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
151 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
152 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
155 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
156 so that it is possible to override the option here without having to
157 rebuild the entire core. as we are required to do if we change regcomp.h
158 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
160 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
161 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
164 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
165 #define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM()
166 #define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE()
167 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
168 #define RE_utf8_perl_word PL_utf8_alnum
169 #define RE_utf8_perl_space PL_utf8_space
170 #define RE_utf8_posix_digit PL_utf8_digit
171 #define perl_word alnum
172 #define perl_space space
173 #define posix_digit digit
175 #define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a")
176 #define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ")
177 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
178 #define RE_utf8_perl_word PL_utf8_perl_word
179 #define RE_utf8_perl_space PL_utf8_perl_space
180 #define RE_utf8_posix_digit PL_utf8_posix_digit
184 #define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
186 PL_reg_flags |= RF_tainted; \
191 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
192 if (!CAT2(PL_utf8_,CLASS)) { \
196 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
200 if (!(OP(scan) == NAME \
201 ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \
202 : LCFUNC_utf8((U8*)locinput))) \
206 locinput += PL_utf8skip[nextchr]; \
207 nextchr = UCHARAT(locinput); \
210 /* Drops through to the macro that calls this one */
212 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
213 _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
214 if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
216 nextchr = UCHARAT(++locinput); \
219 /* Almost identical to the above, but has a case for a node that matches chars
220 * between 128 and 255 using Unicode (latin1) semantics. */
221 #define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \
222 _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
223 if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
225 nextchr = UCHARAT(++locinput); \
228 #define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
230 PL_reg_flags |= RF_tainted; \
233 if (!nextchr && locinput >= PL_regeol) \
235 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
236 if (!CAT2(PL_utf8_,CLASS)) { \
240 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
244 if ((OP(scan) == NAME \
245 ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \
246 : LCFUNC_utf8((U8*)locinput))) \
250 locinput += PL_utf8skip[nextchr]; \
251 nextchr = UCHARAT(locinput); \
255 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
256 _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
257 if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
259 nextchr = UCHARAT(++locinput); \
263 #define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \
264 _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU) \
265 if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
267 nextchr = UCHARAT(++locinput); \
272 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
274 /* for use after a quantifier and before an EXACT-like node -- japhy */
275 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
277 * NOTE that *nothing* that affects backtracking should be in here, specifically
278 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
279 * node that is in between two EXACT like nodes when ascertaining what the required
280 * "follow" character is. This should probably be moved to regex compile time
281 * although it may be done at run time beause of the REF possibility - more
282 * investigation required. -- demerphq
284 #define JUMPABLE(rn) ( \
286 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
288 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
289 OP(rn) == PLUS || OP(rn) == MINMOD || \
291 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
293 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
295 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
298 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
299 we don't need this definition. */
300 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
301 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
302 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
305 /* ... so we use this as its faster. */
306 #define IS_TEXT(rn) ( OP(rn)==EXACT )
307 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
308 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
313 Search for mandatory following text node; for lookahead, the text must
314 follow but for lookbehind (rn->flags != 0) we skip to the next step.
316 #define FIND_NEXT_IMPT(rn) STMT_START { \
317 while (JUMPABLE(rn)) { \
318 const OPCODE type = OP(rn); \
319 if (type == SUSPEND || PL_regkind[type] == CURLY) \
320 rn = NEXTOPER(NEXTOPER(rn)); \
321 else if (type == PLUS) \
323 else if (type == IFMATCH) \
324 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
325 else rn += NEXT_OFF(rn); \
330 static void restore_pos(pTHX_ void *arg);
332 #define REGCP_PAREN_ELEMS 4
333 #define REGCP_OTHER_ELEMS 5
334 #define REGCP_FRAME_ELEMS 1
335 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
336 * are needed for the regexp context stack bookkeeping. */
339 S_regcppush(pTHX_ I32 parenfloor)
342 const int retval = PL_savestack_ix;
343 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
344 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
345 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
347 GET_RE_DEBUG_FLAGS_DECL;
349 if (paren_elems_to_push < 0)
350 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
352 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
353 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
354 " out of range (%lu-%ld)",
355 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
357 SSGROW(total_elems + REGCP_FRAME_ELEMS);
359 for (p = PL_regsize; p > parenfloor; p--) {
360 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
361 SSPUSHINT(PL_regoffs[p].end);
362 SSPUSHINT(PL_regoffs[p].start);
363 SSPUSHPTR(PL_reg_start_tmp[p]);
365 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
366 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
367 (UV)p, (IV)PL_regoffs[p].start,
368 (IV)(PL_reg_start_tmp[p] - PL_bostr),
369 (IV)PL_regoffs[p].end
372 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
373 SSPUSHPTR(PL_regoffs);
374 SSPUSHINT(PL_regsize);
375 SSPUSHINT(*PL_reglastparen);
376 SSPUSHINT(*PL_reglastcloseparen);
377 SSPUSHPTR(PL_reginput);
378 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
383 /* These are needed since we do not localize EVAL nodes: */
384 #define REGCP_SET(cp) \
386 PerlIO_printf(Perl_debug_log, \
387 " Setting an EVAL scope, savestack=%"IVdf"\n", \
388 (IV)PL_savestack_ix)); \
391 #define REGCP_UNWIND(cp) \
393 if (cp != PL_savestack_ix) \
394 PerlIO_printf(Perl_debug_log, \
395 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
396 (IV)(cp), (IV)PL_savestack_ix)); \
400 S_regcppop(pTHX_ const regexp *rex)
405 GET_RE_DEBUG_FLAGS_DECL;
407 PERL_ARGS_ASSERT_REGCPPOP;
409 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
411 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
412 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
413 input = (char *) SSPOPPTR;
414 *PL_reglastcloseparen = SSPOPINT;
415 *PL_reglastparen = SSPOPINT;
416 PL_regsize = SSPOPINT;
417 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
419 i -= REGCP_OTHER_ELEMS;
420 /* Now restore the parentheses context. */
421 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
423 U32 paren = (U32)SSPOPINT;
424 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
425 PL_regoffs[paren].start = SSPOPINT;
427 if (paren <= *PL_reglastparen)
428 PL_regoffs[paren].end = tmps;
430 PerlIO_printf(Perl_debug_log,
431 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
432 (UV)paren, (IV)PL_regoffs[paren].start,
433 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
434 (IV)PL_regoffs[paren].end,
435 (paren > *PL_reglastparen ? "(no)" : ""));
439 if (*PL_reglastparen + 1 <= rex->nparens) {
440 PerlIO_printf(Perl_debug_log,
441 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
442 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
446 /* It would seem that the similar code in regtry()
447 * already takes care of this, and in fact it is in
448 * a better location to since this code can #if 0-ed out
449 * but the code in regtry() is needed or otherwise tests
450 * requiring null fields (pat.t#187 and split.t#{13,14}
451 * (as of patchlevel 7877) will fail. Then again,
452 * this code seems to be necessary or otherwise
453 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
454 * --jhi updated by dapm */
455 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
457 PL_regoffs[i].start = -1;
458 PL_regoffs[i].end = -1;
464 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
467 * pregexec and friends
470 #ifndef PERL_IN_XSUB_RE
472 - pregexec - match a regexp against a string
475 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
476 char *strbeg, I32 minend, SV *screamer, U32 nosave)
477 /* strend: pointer to null at end of string */
478 /* strbeg: real beginning of string */
479 /* minend: end of match must be >=minend after stringarg. */
480 /* nosave: For optimizations. */
482 PERL_ARGS_ASSERT_PREGEXEC;
485 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
486 nosave ? 0 : REXEC_COPY_STR);
491 * Need to implement the following flags for reg_anch:
493 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
495 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
496 * INTUIT_AUTORITATIVE_ML
497 * INTUIT_ONCE_NOML - Intuit can match in one location only.
500 * Another flag for this function: SECOND_TIME (so that float substrs
501 * with giant delta may be not rechecked).
504 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
506 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
507 Otherwise, only SvCUR(sv) is used to get strbeg. */
509 /* XXXX We assume that strpos is strbeg unless sv. */
511 /* XXXX Some places assume that there is a fixed substring.
512 An update may be needed if optimizer marks as "INTUITable"
513 RExen without fixed substrings. Similarly, it is assumed that
514 lengths of all the strings are no more than minlen, thus they
515 cannot come from lookahead.
516 (Or minlen should take into account lookahead.)
517 NOTE: Some of this comment is not correct. minlen does now take account
518 of lookahead/behind. Further research is required. -- demerphq
522 /* A failure to find a constant substring means that there is no need to make
523 an expensive call to REx engine, thus we celebrate a failure. Similarly,
524 finding a substring too deep into the string means that less calls to
525 regtry() should be needed.
527 REx compiler's optimizer found 4 possible hints:
528 a) Anchored substring;
530 c) Whether we are anchored (beginning-of-line or \G);
531 d) First node (of those at offset 0) which may distingush positions;
532 We use a)b)d) and multiline-part of c), and try to find a position in the
533 string which does not contradict any of them.
536 /* Most of decisions we do here should have been done at compile time.
537 The nodes of the REx which we used for the search should have been
538 deleted from the finite automaton. */
541 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
542 char *strend, const U32 flags, re_scream_pos_data *data)
545 struct regexp *const prog = (struct regexp *)SvANY(rx);
546 register I32 start_shift = 0;
547 /* Should be nonnegative! */
548 register I32 end_shift = 0;
553 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
555 register char *other_last = NULL; /* other substr checked before this */
556 char *check_at = NULL; /* check substr found at this pos */
557 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
558 RXi_GET_DECL(prog,progi);
560 const char * const i_strpos = strpos;
562 GET_RE_DEBUG_FLAGS_DECL;
564 PERL_ARGS_ASSERT_RE_INTUIT_START;
566 RX_MATCH_UTF8_set(rx,utf8_target);
569 PL_reg_flags |= RF_utf8;
572 debug_start_match(rx, utf8_target, strpos, strend,
573 sv ? "Guessing start of match in sv for"
574 : "Guessing start of match in string for");
577 /* CHR_DIST() would be more correct here but it makes things slow. */
578 if (prog->minlen > strend - strpos) {
579 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
580 "String too short... [re_intuit_start]\n"));
584 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
587 if (!prog->check_utf8 && prog->check_substr)
588 to_utf8_substr(prog);
589 check = prog->check_utf8;
591 if (!prog->check_substr && prog->check_utf8)
592 to_byte_substr(prog);
593 check = prog->check_substr;
595 if (check == &PL_sv_undef) {
596 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
597 "Non-utf8 string cannot match utf8 check string\n"));
600 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
601 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
602 || ( (prog->extflags & RXf_ANCH_BOL)
603 && !multiline ) ); /* Check after \n? */
606 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
607 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
608 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
610 && (strpos != strbeg)) {
611 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
614 if (prog->check_offset_min == prog->check_offset_max &&
615 !(prog->extflags & RXf_CANY_SEEN)) {
616 /* Substring at constant offset from beg-of-str... */
619 s = HOP3c(strpos, prog->check_offset_min, strend);
622 slen = SvCUR(check); /* >= 1 */
624 if ( strend - s > slen || strend - s < slen - 1
625 || (strend - s == slen && strend[-1] != '\n')) {
626 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
629 /* Now should match s[0..slen-2] */
631 if (slen && (*SvPVX_const(check) != *s
633 && memNE(SvPVX_const(check), s, slen)))) {
635 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
639 else if (*SvPVX_const(check) != *s
640 || ((slen = SvCUR(check)) > 1
641 && memNE(SvPVX_const(check), s, slen)))
644 goto success_at_start;
647 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
649 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
650 end_shift = prog->check_end_shift;
653 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
654 - (SvTAIL(check) != 0);
655 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
657 if (end_shift < eshift)
661 else { /* Can match at random position */
664 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
665 end_shift = prog->check_end_shift;
667 /* end shift should be non negative here */
670 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
672 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
673 (IV)end_shift, RX_PRECOMP(prog));
677 /* Find a possible match in the region s..strend by looking for
678 the "check" substring in the region corrected by start/end_shift. */
681 I32 srch_start_shift = start_shift;
682 I32 srch_end_shift = end_shift;
683 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
684 srch_end_shift -= ((strbeg - s) - srch_start_shift);
685 srch_start_shift = strbeg - s;
687 DEBUG_OPTIMISE_MORE_r({
688 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
689 (IV)prog->check_offset_min,
690 (IV)srch_start_shift,
692 (IV)prog->check_end_shift);
695 if (flags & REXEC_SCREAM) {
696 I32 p = -1; /* Internal iterator of scream. */
697 I32 * const pp = data ? data->scream_pos : &p;
699 if (PL_screamfirst[BmRARE(check)] >= 0
700 || ( BmRARE(check) == '\n'
701 && (BmPREVIOUS(check) == SvCUR(check) - 1)
703 s = screaminstr(sv, check,
704 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
707 /* we may be pointing at the wrong string */
708 if (s && RXp_MATCH_COPIED(prog))
709 s = strbeg + (s - SvPVX_const(sv));
711 *data->scream_olds = s;
716 if (prog->extflags & RXf_CANY_SEEN) {
717 start_point= (U8*)(s + srch_start_shift);
718 end_point= (U8*)(strend - srch_end_shift);
720 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
721 end_point= HOP3(strend, -srch_end_shift, strbeg);
723 DEBUG_OPTIMISE_MORE_r({
724 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
725 (int)(end_point - start_point),
726 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
730 s = fbm_instr( start_point, end_point,
731 check, multiline ? FBMrf_MULTILINE : 0);
734 /* Update the count-of-usability, remove useless subpatterns,
738 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
739 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
740 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
741 (s ? "Found" : "Did not find"),
742 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
743 ? "anchored" : "floating"),
746 (s ? " at offset " : "...\n") );
751 /* Finish the diagnostic message */
752 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
754 /* XXX dmq: first branch is for positive lookbehind...
755 Our check string is offset from the beginning of the pattern.
756 So we need to do any stclass tests offset forward from that
765 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
766 Start with the other substr.
767 XXXX no SCREAM optimization yet - and a very coarse implementation
768 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
769 *always* match. Probably should be marked during compile...
770 Probably it is right to do no SCREAM here...
773 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
774 : (prog->float_substr && prog->anchored_substr))
776 /* Take into account the "other" substring. */
777 /* XXXX May be hopelessly wrong for UTF... */
780 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
783 char * const last = HOP3c(s, -start_shift, strbeg);
785 char * const saved_s = s;
788 t = s - prog->check_offset_max;
789 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
791 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
796 t = HOP3c(t, prog->anchored_offset, strend);
797 if (t < other_last) /* These positions already checked */
799 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
802 /* XXXX It is not documented what units *_offsets are in.
803 We assume bytes, but this is clearly wrong.
804 Meaning this code needs to be carefully reviewed for errors.
808 /* On end-of-str: see comment below. */
809 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
810 if (must == &PL_sv_undef) {
812 DEBUG_r(must = prog->anchored_utf8); /* for debug */
817 HOP3(HOP3(last1, prog->anchored_offset, strend)
818 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
820 multiline ? FBMrf_MULTILINE : 0
823 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
824 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
825 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
826 (s ? "Found" : "Contradicts"),
827 quoted, RE_SV_TAIL(must));
832 if (last1 >= last2) {
833 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
834 ", giving up...\n"));
837 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
838 ", trying floating at offset %ld...\n",
839 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
840 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
841 s = HOP3c(last, 1, strend);
845 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
846 (long)(s - i_strpos)));
847 t = HOP3c(s, -prog->anchored_offset, strbeg);
848 other_last = HOP3c(s, 1, strend);
856 else { /* Take into account the floating substring. */
858 char * const saved_s = s;
861 t = HOP3c(s, -start_shift, strbeg);
863 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
864 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
865 last = HOP3c(t, prog->float_max_offset, strend);
866 s = HOP3c(t, prog->float_min_offset, strend);
869 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
870 must = utf8_target ? prog->float_utf8 : prog->float_substr;
871 /* fbm_instr() takes into account exact value of end-of-str
872 if the check is SvTAIL(ed). Since false positives are OK,
873 and end-of-str is not later than strend we are OK. */
874 if (must == &PL_sv_undef) {
876 DEBUG_r(must = prog->float_utf8); /* for debug message */
879 s = fbm_instr((unsigned char*)s,
880 (unsigned char*)last + SvCUR(must)
882 must, multiline ? FBMrf_MULTILINE : 0);
884 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
885 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
886 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
887 (s ? "Found" : "Contradicts"),
888 quoted, RE_SV_TAIL(must));
892 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
893 ", giving up...\n"));
896 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
897 ", trying anchored starting at offset %ld...\n",
898 (long)(saved_s + 1 - i_strpos)));
900 s = HOP3c(t, 1, strend);
904 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
905 (long)(s - i_strpos)));
906 other_last = s; /* Fix this later. --Hugo */
916 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
918 DEBUG_OPTIMISE_MORE_r(
919 PerlIO_printf(Perl_debug_log,
920 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
921 (IV)prog->check_offset_min,
922 (IV)prog->check_offset_max,
930 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
932 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
935 /* Fixed substring is found far enough so that the match
936 cannot start at strpos. */
938 if (ml_anch && t[-1] != '\n') {
939 /* Eventually fbm_*() should handle this, but often
940 anchored_offset is not 0, so this check will not be wasted. */
941 /* XXXX In the code below we prefer to look for "^" even in
942 presence of anchored substrings. And we search even
943 beyond the found float position. These pessimizations
944 are historical artefacts only. */
946 while (t < strend - prog->minlen) {
948 if (t < check_at - prog->check_offset_min) {
949 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
950 /* Since we moved from the found position,
951 we definitely contradict the found anchored
952 substr. Due to the above check we do not
953 contradict "check" substr.
954 Thus we can arrive here only if check substr
955 is float. Redo checking for "other"=="fixed".
958 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
959 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
960 goto do_other_anchored;
962 /* We don't contradict the found floating substring. */
963 /* XXXX Why not check for STCLASS? */
965 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
966 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
969 /* Position contradicts check-string */
970 /* XXXX probably better to look for check-string
971 than for "\n", so one should lower the limit for t? */
972 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
973 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
974 other_last = strpos = s = t + 1;
979 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
980 PL_colors[0], PL_colors[1]));
984 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
985 PL_colors[0], PL_colors[1]));
989 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
992 /* The found string does not prohibit matching at strpos,
993 - no optimization of calling REx engine can be performed,
994 unless it was an MBOL and we are not after MBOL,
995 or a future STCLASS check will fail this. */
997 /* Even in this situation we may use MBOL flag if strpos is offset
998 wrt the start of the string. */
999 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
1000 && (strpos != strbeg) && strpos[-1] != '\n'
1001 /* May be due to an implicit anchor of m{.*foo} */
1002 && !(prog->intflags & PREGf_IMPLICIT))
1007 DEBUG_EXECUTE_r( if (ml_anch)
1008 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1009 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1012 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1014 prog->check_utf8 /* Could be deleted already */
1015 && --BmUSEFUL(prog->check_utf8) < 0
1016 && (prog->check_utf8 == prog->float_utf8)
1018 prog->check_substr /* Could be deleted already */
1019 && --BmUSEFUL(prog->check_substr) < 0
1020 && (prog->check_substr == prog->float_substr)
1023 /* If flags & SOMETHING - do not do it many times on the same match */
1024 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1025 /* XXX Does the destruction order has to change with utf8_target? */
1026 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1027 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1028 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1029 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1030 check = NULL; /* abort */
1032 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevent flag
1033 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1034 if (prog->intflags & PREGf_IMPLICIT)
1035 prog->extflags &= ~RXf_ANCH_MBOL;
1036 /* XXXX This is a remnant of the old implementation. It
1037 looks wasteful, since now INTUIT can use many
1038 other heuristics. */
1039 prog->extflags &= ~RXf_USE_INTUIT;
1040 /* XXXX What other flags might need to be cleared in this branch? */
1046 /* Last resort... */
1047 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1048 /* trie stclasses are too expensive to use here, we are better off to
1049 leave it to regmatch itself */
1050 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1051 /* minlen == 0 is possible if regstclass is \b or \B,
1052 and the fixed substr is ''$.
1053 Since minlen is already taken into account, s+1 is before strend;
1054 accidentally, minlen >= 1 guaranties no false positives at s + 1
1055 even for \b or \B. But (minlen? 1 : 0) below assumes that
1056 regstclass does not come from lookahead... */
1057 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1058 This leaves EXACTF only, which is dealt with in find_byclass(). */
1059 const U8* const str = (U8*)STRING(progi->regstclass);
1060 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1061 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1064 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1065 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1066 else if (prog->float_substr || prog->float_utf8)
1067 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1071 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1072 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1075 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1078 const char *what = NULL;
1080 if (endpos == strend) {
1081 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1082 "Could not match STCLASS...\n") );
1085 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1086 "This position contradicts STCLASS...\n") );
1087 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1089 /* Contradict one of substrings */
1090 if (prog->anchored_substr || prog->anchored_utf8) {
1091 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1092 DEBUG_EXECUTE_r( what = "anchored" );
1094 s = HOP3c(t, 1, strend);
1095 if (s + start_shift + end_shift > strend) {
1096 /* XXXX Should be taken into account earlier? */
1097 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1098 "Could not match STCLASS...\n") );
1103 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1104 "Looking for %s substr starting at offset %ld...\n",
1105 what, (long)(s + start_shift - i_strpos)) );
1108 /* Have both, check_string is floating */
1109 if (t + start_shift >= check_at) /* Contradicts floating=check */
1110 goto retry_floating_check;
1111 /* Recheck anchored substring, but not floating... */
1115 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1116 "Looking for anchored substr starting at offset %ld...\n",
1117 (long)(other_last - i_strpos)) );
1118 goto do_other_anchored;
1120 /* Another way we could have checked stclass at the
1121 current position only: */
1126 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1127 "Looking for /%s^%s/m starting at offset %ld...\n",
1128 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1131 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1133 /* Check is floating subtring. */
1134 retry_floating_check:
1135 t = check_at - start_shift;
1136 DEBUG_EXECUTE_r( what = "floating" );
1137 goto hop_and_restart;
1140 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1141 "By STCLASS: moving %ld --> %ld\n",
1142 (long)(t - i_strpos), (long)(s - i_strpos))
1146 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1147 "Does not contradict STCLASS...\n");
1152 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1153 PL_colors[4], (check ? "Guessed" : "Giving up"),
1154 PL_colors[5], (long)(s - i_strpos)) );
1157 fail_finish: /* Substring not found */
1158 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1159 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1161 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1162 PL_colors[4], PL_colors[5]));
1166 #define DECL_TRIE_TYPE(scan) \
1167 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1168 trie_type = (scan->flags != EXACT) \
1169 ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1170 : (utf8_target ? trie_utf8 : trie_plain)
1172 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1173 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1174 switch (trie_type) { \
1175 case trie_utf8_fold: \
1176 if ( foldlen>0 ) { \
1177 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1182 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1183 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1184 foldlen -= UNISKIP( uvc ); \
1185 uscan = foldbuf + UNISKIP( uvc ); \
1188 case trie_latin_utf8_fold: \
1189 if ( foldlen>0 ) { \
1190 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1196 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1197 foldlen -= UNISKIP( uvc ); \
1198 uscan = foldbuf + UNISKIP( uvc ); \
1202 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1209 charid = trie->charmap[ uvc ]; \
1213 if (widecharmap) { \
1214 SV** const svpp = hv_fetch(widecharmap, \
1215 (char*)&uvc, sizeof(UV), 0); \
1217 charid = (U16)SvIV(*svpp); \
1222 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1224 char *my_strend= (char *)strend; \
1227 foldEQ_utf8(s, &my_strend, 0, utf8_target, \
1228 m, NULL, ln, cBOOL(UTF_PATTERN))) \
1229 && (!reginfo || regtry(reginfo, &s)) ) \
1232 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1233 uvchr_to_utf8(tmpbuf, c); \
1234 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1236 && (f == c1 || f == c2) \
1238 foldEQ_utf8(s, &my_strend, 0, utf8_target,\
1239 m, NULL, ln, cBOOL(UTF_PATTERN)))\
1240 && (!reginfo || regtry(reginfo, &s)) ) \
1246 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1250 && (ln == 1 || (OP(c) == EXACTF \
1251 ? foldEQ(s, m, ln) \
1252 : foldEQ_locale(s, m, ln))) \
1253 && (!reginfo || regtry(reginfo, &s)) ) \
1259 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1261 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1267 #define REXEC_FBC_SCAN(CoDe) \
1269 while (s < strend) { \
1275 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1276 REXEC_FBC_UTF8_SCAN( \
1278 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1287 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1290 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1299 #define REXEC_FBC_TRYIT \
1300 if ((!reginfo || regtry(reginfo, &s))) \
1303 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1304 if (utf8_target) { \
1305 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1308 REXEC_FBC_CLASS_SCAN(CoNd); \
1312 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1313 if (utf8_target) { \
1315 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1318 REXEC_FBC_CLASS_SCAN(CoNd); \
1322 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1323 PL_reg_flags |= RF_tainted; \
1324 if (utf8_target) { \
1325 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1328 REXEC_FBC_CLASS_SCAN(CoNd); \
1332 #define DUMP_EXEC_POS(li,s,doutf8) \
1333 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1335 /* We know what class REx starts with. Try to find this position... */
1336 /* if reginfo is NULL, its a dryrun */
1337 /* annoyingly all the vars in this routine have different names from their counterparts
1338 in regmatch. /grrr */
1341 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1342 const char *strend, regmatch_info *reginfo)
1345 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1349 register STRLEN uskip;
1353 register I32 tmp = 1; /* Scratch variable? */
1354 register const bool utf8_target = PL_reg_match_utf8;
1355 RXi_GET_DECL(prog,progi);
1357 PERL_ARGS_ASSERT_FIND_BYCLASS;
1359 /* We know what class it must start with. */
1363 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1364 !UTF8_IS_INVARIANT((U8)s[0]) ?
1365 reginclass(prog, c, (U8*)s, 0, utf8_target) :
1366 REGINCLASS(prog, c, (U8*)s));
1369 while (s < strend) {
1372 if (REGINCLASS(prog, c, (U8*)s) ||
1373 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1374 /* The assignment of 2 is intentional:
1375 * for the folded sharp s, the skip is 2. */
1376 (skip = SHARP_S_SKIP))) {
1377 if (tmp && (!reginfo || regtry(reginfo, &s)))
1390 if (tmp && (!reginfo || regtry(reginfo, &s)))
1398 ln = STR_LEN(c); /* length to match in octets/bytes */
1399 lnc = (I32) ln; /* length to match in characters */
1401 STRLEN ulen1, ulen2;
1403 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1404 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1405 /* used by commented-out code below */
1406 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1408 /* XXX: Since the node will be case folded at compile
1409 time this logic is a little odd, although im not
1410 sure that its actually wrong. --dmq */
1412 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1413 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1415 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1416 codepoint of the first character in the converted
1417 form, yet originally we did the extra step.
1418 No tests fail by commenting this code out however
1419 so Ive left it out. -- dmq.
1421 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1423 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1428 while (sm < ((U8 *) m + ln)) {
1443 c2 = PL_fold_locale[c1];
1445 e = HOP3c(strend, -((I32)lnc), s);
1447 if (!reginfo && e < s)
1448 e = s; /* Due to minlen logic of intuit() */
1450 /* The idea in the EXACTF* cases is to first find the
1451 * first character of the EXACTF* node and then, if
1452 * necessary, case-insensitively compare the full
1453 * text of the node. The c1 and c2 are the first
1454 * characters (though in Unicode it gets a bit
1455 * more complicated because there are more cases
1456 * than just upper and lower: one needs to use
1457 * the so-called folding case for case-insensitive
1458 * matching (called "loose matching" in Unicode).
1459 * foldEQ_utf8() will do just that. */
1461 if (utf8_target || UTF_PATTERN) {
1463 U8 tmpbuf [UTF8_MAXBYTES+1];
1466 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1468 /* Upper and lower of 1st char are equal -
1469 * probably not a "letter". */
1472 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1477 REXEC_FBC_EXACTISH_CHECK(c == c1);
1483 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1489 /* Handle some of the three Greek sigmas cases.
1490 * Note that not all the possible combinations
1491 * are handled here: some of them are handled
1492 * by the standard folding rules, and some of
1493 * them (the character class or ANYOF cases)
1494 * are handled during compiletime in
1495 * regexec.c:S_regclass(). */
1496 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1497 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1498 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1500 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1505 /* Neither pattern nor string are UTF8 */
1507 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1509 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1513 PL_reg_flags |= RF_tainted;
1520 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1521 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1523 tmp = ((OP(c) == BOUND ?
1524 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1525 LOAD_UTF8_CHARCLASS_ALNUM();
1526 REXEC_FBC_UTF8_SCAN(
1527 if (tmp == !(OP(c) == BOUND ?
1528 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1529 isALNUM_LC_utf8((U8*)s)))
1536 else { /* Not utf8 */
1537 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1538 tmp = cBOOL((OP(c) == BOUNDL)
1540 : (isWORDCHAR_L1(tmp)
1541 && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1546 : (isWORDCHAR_L1((U8) *s)
1547 && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1554 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1558 PL_reg_flags |= RF_tainted;
1565 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1566 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1568 tmp = ((OP(c) == NBOUND ?
1569 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1570 LOAD_UTF8_CHARCLASS_ALNUM();
1571 REXEC_FBC_UTF8_SCAN(
1572 if (tmp == !(OP(c) == NBOUND ?
1573 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1574 isALNUM_LC_utf8((U8*)s)))
1576 else REXEC_FBC_TRYIT;
1580 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1581 tmp = cBOOL((OP(c) == NBOUNDL)
1583 : (isWORDCHAR_L1(tmp)
1584 && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1589 : (isWORDCHAR_L1((U8) *s)
1590 && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1594 else REXEC_FBC_TRYIT;
1597 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1601 REXEC_FBC_CSCAN_PRELOAD(
1602 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1603 swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1604 (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
1607 REXEC_FBC_CSCAN_TAINT(
1608 isALNUM_LC_utf8((U8*)s),
1612 REXEC_FBC_CSCAN_PRELOAD(
1613 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1614 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1615 ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
1618 REXEC_FBC_CSCAN_TAINT(
1619 !isALNUM_LC_utf8((U8*)s),
1623 REXEC_FBC_CSCAN_PRELOAD(
1624 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1625 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1626 isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
1629 REXEC_FBC_CSCAN_TAINT(
1630 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1634 REXEC_FBC_CSCAN_PRELOAD(
1635 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1636 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1637 !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
1640 REXEC_FBC_CSCAN_TAINT(
1641 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1645 REXEC_FBC_CSCAN_PRELOAD(
1646 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1647 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1651 REXEC_FBC_CSCAN_TAINT(
1652 isDIGIT_LC_utf8((U8*)s),
1656 REXEC_FBC_CSCAN_PRELOAD(
1657 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1658 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1662 REXEC_FBC_CSCAN_TAINT(
1663 !isDIGIT_LC_utf8((U8*)s),
1669 is_LNBREAK_latin1(s)
1679 !is_VERTWS_latin1(s)
1684 is_HORIZWS_latin1(s)
1688 !is_HORIZWS_utf8(s),
1689 !is_HORIZWS_latin1(s)
1695 /* what trie are we using right now */
1697 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1699 = (reg_trie_data*)progi->data->data[ aho->trie ];
1700 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1702 const char *last_start = strend - trie->minlen;
1704 const char *real_start = s;
1706 STRLEN maxlen = trie->maxlen;
1708 U8 **points; /* map of where we were in the input string
1709 when reading a given char. For ASCII this
1710 is unnecessary overhead as the relationship
1711 is always 1:1, but for Unicode, especially
1712 case folded Unicode this is not true. */
1713 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1717 GET_RE_DEBUG_FLAGS_DECL;
1719 /* We can't just allocate points here. We need to wrap it in
1720 * an SV so it gets freed properly if there is a croak while
1721 * running the match */
1724 sv_points=newSV(maxlen * sizeof(U8 *));
1725 SvCUR_set(sv_points,
1726 maxlen * sizeof(U8 *));
1727 SvPOK_on(sv_points);
1728 sv_2mortal(sv_points);
1729 points=(U8**)SvPV_nolen(sv_points );
1730 if ( trie_type != trie_utf8_fold
1731 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1734 bitmap=(U8*)trie->bitmap;
1736 bitmap=(U8*)ANYOF_BITMAP(c);
1738 /* this is the Aho-Corasick algorithm modified a touch
1739 to include special handling for long "unknown char"
1740 sequences. The basic idea being that we use AC as long
1741 as we are dealing with a possible matching char, when
1742 we encounter an unknown char (and we have not encountered
1743 an accepting state) we scan forward until we find a legal
1745 AC matching is basically that of trie matching, except
1746 that when we encounter a failing transition, we fall back
1747 to the current states "fail state", and try the current char
1748 again, a process we repeat until we reach the root state,
1749 state 1, or a legal transition. If we fail on the root state
1750 then we can either terminate if we have reached an accepting
1751 state previously, or restart the entire process from the beginning
1755 while (s <= last_start) {
1756 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1764 U8 *uscan = (U8*)NULL;
1765 U8 *leftmost = NULL;
1767 U32 accepted_word= 0;
1771 while ( state && uc <= (U8*)strend ) {
1773 U32 word = aho->states[ state ].wordnum;
1777 DEBUG_TRIE_EXECUTE_r(
1778 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1779 dump_exec_pos( (char *)uc, c, strend, real_start,
1780 (char *)uc, utf8_target );
1781 PerlIO_printf( Perl_debug_log,
1782 " Scanning for legal start char...\n");
1786 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1790 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1796 if (uc >(U8*)last_start) break;
1800 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1801 if (!leftmost || lpos < leftmost) {
1802 DEBUG_r(accepted_word=word);
1808 points[pointpos++ % maxlen]= uc;
1809 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1810 uscan, len, uvc, charid, foldlen,
1812 DEBUG_TRIE_EXECUTE_r({
1813 dump_exec_pos( (char *)uc, c, strend, real_start,
1815 PerlIO_printf(Perl_debug_log,
1816 " Charid:%3u CP:%4"UVxf" ",
1822 word = aho->states[ state ].wordnum;
1824 base = aho->states[ state ].trans.base;
1826 DEBUG_TRIE_EXECUTE_r({
1828 dump_exec_pos( (char *)uc, c, strend, real_start,
1830 PerlIO_printf( Perl_debug_log,
1831 "%sState: %4"UVxf", word=%"UVxf,
1832 failed ? " Fail transition to " : "",
1833 (UV)state, (UV)word);
1839 ( ((offset = base + charid
1840 - 1 - trie->uniquecharcount)) >= 0)
1841 && ((U32)offset < trie->lasttrans)
1842 && trie->trans[offset].check == state
1843 && (tmp=trie->trans[offset].next))
1845 DEBUG_TRIE_EXECUTE_r(
1846 PerlIO_printf( Perl_debug_log," - legal\n"));
1851 DEBUG_TRIE_EXECUTE_r(
1852 PerlIO_printf( Perl_debug_log," - fail\n"));
1854 state = aho->fail[state];
1858 /* we must be accepting here */
1859 DEBUG_TRIE_EXECUTE_r(
1860 PerlIO_printf( Perl_debug_log," - accepting\n"));
1869 if (!state) state = 1;
1872 if ( aho->states[ state ].wordnum ) {
1873 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1874 if (!leftmost || lpos < leftmost) {
1875 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1880 s = (char*)leftmost;
1881 DEBUG_TRIE_EXECUTE_r({
1883 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1884 (UV)accepted_word, (IV)(s - real_start)
1887 if (!reginfo || regtry(reginfo, &s)) {
1893 DEBUG_TRIE_EXECUTE_r({
1894 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1897 DEBUG_TRIE_EXECUTE_r(
1898 PerlIO_printf( Perl_debug_log,"No match.\n"));
1907 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1917 - regexec_flags - match a regexp against a string
1920 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1921 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1922 /* strend: pointer to null at end of string */
1923 /* strbeg: real beginning of string */
1924 /* minend: end of match must be >=minend after stringarg. */
1925 /* data: May be used for some additional optimizations.
1926 Currently its only used, with a U32 cast, for transmitting
1927 the ganch offset when doing a /g match. This will change */
1928 /* nosave: For optimizations. */
1931 struct regexp *const prog = (struct regexp *)SvANY(rx);
1932 /*register*/ char *s;
1933 register regnode *c;
1934 /*register*/ char *startpos = stringarg;
1935 I32 minlen; /* must match at least this many chars */
1936 I32 dontbother = 0; /* how many characters not to try at end */
1937 I32 end_shift = 0; /* Same for the end. */ /* CC */
1938 I32 scream_pos = -1; /* Internal iterator of scream. */
1939 char *scream_olds = NULL;
1940 const bool utf8_target = cBOOL(DO_UTF8(sv));
1942 RXi_GET_DECL(prog,progi);
1943 regmatch_info reginfo; /* create some info to pass to regtry etc */
1944 regexp_paren_pair *swap = NULL;
1945 GET_RE_DEBUG_FLAGS_DECL;
1947 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1948 PERL_UNUSED_ARG(data);
1950 /* Be paranoid... */
1951 if (prog == NULL || startpos == NULL) {
1952 Perl_croak(aTHX_ "NULL regexp parameter");
1956 multiline = prog->extflags & RXf_PMf_MULTILINE;
1957 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
1959 RX_MATCH_UTF8_set(rx, utf8_target);
1961 debug_start_match(rx, utf8_target, startpos, strend,
1965 minlen = prog->minlen;
1967 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1968 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1969 "String too short [regexec_flags]...\n"));
1974 /* Check validity of program. */
1975 if (UCHARAT(progi->program) != REG_MAGIC) {
1976 Perl_croak(aTHX_ "corrupted regexp program");
1980 PL_reg_eval_set = 0;
1984 PL_reg_flags |= RF_utf8;
1986 /* Mark beginning of line for ^ and lookbehind. */
1987 reginfo.bol = startpos; /* XXX not used ??? */
1991 /* Mark end of line for $ (and such) */
1994 /* see how far we have to get to not match where we matched before */
1995 reginfo.till = startpos+minend;
1997 /* If there is a "must appear" string, look for it. */
2000 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
2002 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
2003 reginfo.ganch = startpos + prog->gofs;
2004 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2005 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
2006 } else if (sv && SvTYPE(sv) >= SVt_PVMG
2008 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
2009 && mg->mg_len >= 0) {
2010 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2011 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2012 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2014 if (prog->extflags & RXf_ANCH_GPOS) {
2015 if (s > reginfo.ganch)
2017 s = reginfo.ganch - prog->gofs;
2018 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2019 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2025 reginfo.ganch = strbeg + PTR2UV(data);
2026 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2027 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2029 } else { /* pos() not defined */
2030 reginfo.ganch = strbeg;
2031 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2032 "GPOS: reginfo.ganch = strbeg\n"));
2035 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2036 /* We have to be careful. If the previous successful match
2037 was from this regex we don't want a subsequent partially
2038 successful match to clobber the old results.
2039 So when we detect this possibility we add a swap buffer
2040 to the re, and switch the buffer each match. If we fail
2041 we switch it back, otherwise we leave it swapped.
2044 /* do we need a save destructor here for eval dies? */
2045 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2047 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2048 re_scream_pos_data d;
2050 d.scream_olds = &scream_olds;
2051 d.scream_pos = &scream_pos;
2052 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2054 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2055 goto phooey; /* not present */
2061 /* Simplest case: anchored match need be tried only once. */
2062 /* [unless only anchor is BOL and multiline is set] */
2063 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2064 if (s == startpos && regtry(®info, &startpos))
2066 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2067 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2072 dontbother = minlen - 1;
2073 end = HOP3c(strend, -dontbother, strbeg) - 1;
2074 /* for multiline we only have to try after newlines */
2075 if (prog->check_substr || prog->check_utf8) {
2076 /* because of the goto we can not easily reuse the macros for bifurcating the
2077 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2080 goto after_try_utf8;
2082 if (regtry(®info, &s)) {
2089 if (prog->extflags & RXf_USE_INTUIT) {
2090 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2099 } /* end search for check string in unicode */
2101 if (s == startpos) {
2102 goto after_try_latin;
2105 if (regtry(®info, &s)) {
2112 if (prog->extflags & RXf_USE_INTUIT) {
2113 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2122 } /* end search for check string in latin*/
2123 } /* end search for check string */
2124 else { /* search for newline */
2126 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2129 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2131 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2132 if (regtry(®info, &s))
2136 } /* end search for newline */
2137 } /* end anchored/multiline check string search */
2139 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2141 /* the warning about reginfo.ganch being used without intialization
2142 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2143 and we only enter this block when the same bit is set. */
2144 char *tmp_s = reginfo.ganch - prog->gofs;
2146 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2151 /* Messy cases: unanchored match. */
2152 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2153 /* we have /x+whatever/ */
2154 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2159 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2160 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2161 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2166 DEBUG_EXECUTE_r( did_match = 1 );
2167 if (regtry(®info, &s)) goto got_it;
2169 while (s < strend && *s == ch)
2177 DEBUG_EXECUTE_r( did_match = 1 );
2178 if (regtry(®info, &s)) goto got_it;
2180 while (s < strend && *s == ch)
2185 DEBUG_EXECUTE_r(if (!did_match)
2186 PerlIO_printf(Perl_debug_log,
2187 "Did not find anchored character...\n")
2190 else if (prog->anchored_substr != NULL
2191 || prog->anchored_utf8 != NULL
2192 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2193 && prog->float_max_offset < strend - s)) {
2198 char *last1; /* Last position checked before */
2202 if (prog->anchored_substr || prog->anchored_utf8) {
2203 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2204 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2205 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2206 back_max = back_min = prog->anchored_offset;
2208 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2209 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2210 must = utf8_target ? prog->float_utf8 : prog->float_substr;
2211 back_max = prog->float_max_offset;
2212 back_min = prog->float_min_offset;
2216 if (must == &PL_sv_undef)
2217 /* could not downgrade utf8 check substring, so must fail */
2223 last = HOP3c(strend, /* Cannot start after this */
2224 -(I32)(CHR_SVLEN(must)
2225 - (SvTAIL(must) != 0) + back_min), strbeg);
2228 last1 = HOPc(s, -1);
2230 last1 = s - 1; /* bogus */
2232 /* XXXX check_substr already used to find "s", can optimize if
2233 check_substr==must. */
2235 dontbother = end_shift;
2236 strend = HOPc(strend, -dontbother);
2237 while ( (s <= last) &&
2238 ((flags & REXEC_SCREAM)
2239 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2240 end_shift, &scream_pos, 0))
2241 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2242 (unsigned char*)strend, must,
2243 multiline ? FBMrf_MULTILINE : 0))) ) {
2244 /* we may be pointing at the wrong string */
2245 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2246 s = strbeg + (s - SvPVX_const(sv));
2247 DEBUG_EXECUTE_r( did_match = 1 );
2248 if (HOPc(s, -back_max) > last1) {
2249 last1 = HOPc(s, -back_min);
2250 s = HOPc(s, -back_max);
2253 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2255 last1 = HOPc(s, -back_min);
2259 while (s <= last1) {
2260 if (regtry(®info, &s))
2266 while (s <= last1) {
2267 if (regtry(®info, &s))
2273 DEBUG_EXECUTE_r(if (!did_match) {
2274 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2275 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2276 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2277 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2278 ? "anchored" : "floating"),
2279 quoted, RE_SV_TAIL(must));
2283 else if ( (c = progi->regstclass) ) {
2285 const OPCODE op = OP(progi->regstclass);
2286 /* don't bother with what can't match */
2287 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2288 strend = HOPc(strend, -(minlen - 1));
2291 SV * const prop = sv_newmortal();
2292 regprop(prog, prop, c);
2294 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2296 PerlIO_printf(Perl_debug_log,
2297 "Matching stclass %.*s against %s (%d bytes)\n",
2298 (int)SvCUR(prop), SvPVX_const(prop),
2299 quoted, (int)(strend - s));
2302 if (find_byclass(prog, c, s, strend, ®info))
2304 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2308 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2313 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2314 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2315 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2317 if (flags & REXEC_SCREAM) {
2318 last = screaminstr(sv, float_real, s - strbeg,
2319 end_shift, &scream_pos, 1); /* last one */
2321 last = scream_olds; /* Only one occurrence. */
2322 /* we may be pointing at the wrong string */
2323 else if (RXp_MATCH_COPIED(prog))
2324 s = strbeg + (s - SvPVX_const(sv));
2328 const char * const little = SvPV_const(float_real, len);
2330 if (SvTAIL(float_real)) {
2331 if (memEQ(strend - len + 1, little, len - 1))
2332 last = strend - len + 1;
2333 else if (!multiline)
2334 last = memEQ(strend - len, little, len)
2335 ? strend - len : NULL;
2341 last = rninstr(s, strend, little, little + len);
2343 last = strend; /* matching "$" */
2348 PerlIO_printf(Perl_debug_log,
2349 "%sCan't trim the tail, match fails (should not happen)%s\n",
2350 PL_colors[4], PL_colors[5]));
2351 goto phooey; /* Should not happen! */
2353 dontbother = strend - last + prog->float_min_offset;
2355 if (minlen && (dontbother < minlen))
2356 dontbother = minlen - 1;
2357 strend -= dontbother; /* this one's always in bytes! */
2358 /* We don't know much -- general case. */
2361 if (regtry(®info, &s))
2370 if (regtry(®info, &s))
2372 } while (s++ < strend);
2381 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2383 if (PL_reg_eval_set)
2384 restore_pos(aTHX_ prog);
2385 if (RXp_PAREN_NAMES(prog))
2386 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2388 /* make sure $`, $&, $', and $digit will work later */
2389 if ( !(flags & REXEC_NOT_FIRST) ) {
2390 RX_MATCH_COPY_FREE(rx);
2391 if (flags & REXEC_COPY_STR) {
2392 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2393 #ifdef PERL_OLD_COPY_ON_WRITE
2395 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2397 PerlIO_printf(Perl_debug_log,
2398 "Copy on write: regexp capture, type %d\n",
2401 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2402 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2403 assert (SvPOKp(prog->saved_copy));
2407 RX_MATCH_COPIED_on(rx);
2408 s = savepvn(strbeg, i);
2414 prog->subbeg = strbeg;
2415 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2422 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2423 PL_colors[4], PL_colors[5]));
2424 if (PL_reg_eval_set)
2425 restore_pos(aTHX_ prog);
2427 /* we failed :-( roll it back */
2428 Safefree(prog->offs);
2437 - regtry - try match at specific point
2439 STATIC I32 /* 0 failure, 1 success */
2440 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2444 REGEXP *const rx = reginfo->prog;
2445 regexp *const prog = (struct regexp *)SvANY(rx);
2446 RXi_GET_DECL(prog,progi);
2447 GET_RE_DEBUG_FLAGS_DECL;
2449 PERL_ARGS_ASSERT_REGTRY;
2451 reginfo->cutpoint=NULL;
2453 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2456 PL_reg_eval_set = RS_init;
2457 DEBUG_EXECUTE_r(DEBUG_s(
2458 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2459 (IV)(PL_stack_sp - PL_stack_base));
2462 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2463 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2465 /* Apparently this is not needed, judging by wantarray. */
2466 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2467 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2470 /* Make $_ available to executed code. */
2471 if (reginfo->sv != DEFSV) {
2473 DEFSV_set(reginfo->sv);
2476 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2477 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2478 /* prepare for quick setting of pos */
2479 #ifdef PERL_OLD_COPY_ON_WRITE
2480 if (SvIsCOW(reginfo->sv))
2481 sv_force_normal_flags(reginfo->sv, 0);
2483 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2484 &PL_vtbl_mglob, NULL, 0);
2488 PL_reg_oldpos = mg->mg_len;
2489 SAVEDESTRUCTOR_X(restore_pos, prog);
2491 if (!PL_reg_curpm) {
2492 Newxz(PL_reg_curpm, 1, PMOP);
2495 SV* const repointer = &PL_sv_undef;
2496 /* this regexp is also owned by the new PL_reg_curpm, which
2497 will try to free it. */
2498 av_push(PL_regex_padav, repointer);
2499 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2500 PL_regex_pad = AvARRAY(PL_regex_padav);
2505 /* It seems that non-ithreads works both with and without this code.
2506 So for efficiency reasons it seems best not to have the code
2507 compiled when it is not needed. */
2508 /* This is safe against NULLs: */
2509 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2510 /* PM_reg_curpm owns a reference to this regexp. */
2513 PM_SETRE(PL_reg_curpm, rx);
2514 PL_reg_oldcurpm = PL_curpm;
2515 PL_curpm = PL_reg_curpm;
2516 if (RXp_MATCH_COPIED(prog)) {
2517 /* Here is a serious problem: we cannot rewrite subbeg,
2518 since it may be needed if this match fails. Thus
2519 $` inside (?{}) could fail... */
2520 PL_reg_oldsaved = prog->subbeg;
2521 PL_reg_oldsavedlen = prog->sublen;
2522 #ifdef PERL_OLD_COPY_ON_WRITE
2523 PL_nrs = prog->saved_copy;
2525 RXp_MATCH_COPIED_off(prog);
2528 PL_reg_oldsaved = NULL;
2529 prog->subbeg = PL_bostr;
2530 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2532 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2533 prog->offs[0].start = *startpos - PL_bostr;
2534 PL_reginput = *startpos;
2535 PL_reglastparen = &prog->lastparen;
2536 PL_reglastcloseparen = &prog->lastcloseparen;
2537 prog->lastparen = 0;
2538 prog->lastcloseparen = 0;
2540 PL_regoffs = prog->offs;
2541 if (PL_reg_start_tmpl <= prog->nparens) {
2542 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2543 if(PL_reg_start_tmp)
2544 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2546 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2549 /* XXXX What this code is doing here?!!! There should be no need
2550 to do this again and again, PL_reglastparen should take care of
2553 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2554 * Actually, the code in regcppop() (which Ilya may be meaning by
2555 * PL_reglastparen), is not needed at all by the test suite
2556 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2557 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2558 * Meanwhile, this code *is* needed for the
2559 * above-mentioned test suite tests to succeed. The common theme
2560 * on those tests seems to be returning null fields from matches.
2561 * --jhi updated by dapm */
2563 if (prog->nparens) {
2564 regexp_paren_pair *pp = PL_regoffs;
2566 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2574 if (regmatch(reginfo, progi->program + 1)) {
2575 PL_regoffs[0].end = PL_reginput - PL_bostr;
2578 if (reginfo->cutpoint)
2579 *startpos= reginfo->cutpoint;
2580 REGCP_UNWIND(lastcp);
2585 #define sayYES goto yes
2586 #define sayNO goto no
2587 #define sayNO_SILENT goto no_silent
2589 /* we dont use STMT_START/END here because it leads to
2590 "unreachable code" warnings, which are bogus, but distracting. */
2591 #define CACHEsayNO \
2592 if (ST.cache_mask) \
2593 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2596 /* this is used to determine how far from the left messages like
2597 'failed...' are printed. It should be set such that messages
2598 are inline with the regop output that created them.
2600 #define REPORT_CODE_OFF 32
2603 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2604 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2606 #define SLAB_FIRST(s) (&(s)->states[0])
2607 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2609 /* grab a new slab and return the first slot in it */
2611 STATIC regmatch_state *
2614 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2617 regmatch_slab *s = PL_regmatch_slab->next;
2619 Newx(s, 1, regmatch_slab);
2620 s->prev = PL_regmatch_slab;
2622 PL_regmatch_slab->next = s;
2624 PL_regmatch_slab = s;
2625 return SLAB_FIRST(s);
2629 /* push a new state then goto it */
2631 #define PUSH_STATE_GOTO(state, node) \
2633 st->resume_state = state; \
2636 /* push a new state with success backtracking, then goto it */
2638 #define PUSH_YES_STATE_GOTO(state, node) \
2640 st->resume_state = state; \
2641 goto push_yes_state;
2647 regmatch() - main matching routine
2649 This is basically one big switch statement in a loop. We execute an op,
2650 set 'next' to point the next op, and continue. If we come to a point which
2651 we may need to backtrack to on failure such as (A|B|C), we push a
2652 backtrack state onto the backtrack stack. On failure, we pop the top
2653 state, and re-enter the loop at the state indicated. If there are no more
2654 states to pop, we return failure.
2656 Sometimes we also need to backtrack on success; for example /A+/, where
2657 after successfully matching one A, we need to go back and try to
2658 match another one; similarly for lookahead assertions: if the assertion
2659 completes successfully, we backtrack to the state just before the assertion
2660 and then carry on. In these cases, the pushed state is marked as
2661 'backtrack on success too'. This marking is in fact done by a chain of
2662 pointers, each pointing to the previous 'yes' state. On success, we pop to
2663 the nearest yes state, discarding any intermediate failure-only states.
2664 Sometimes a yes state is pushed just to force some cleanup code to be
2665 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2666 it to free the inner regex.
2668 Note that failure backtracking rewinds the cursor position, while
2669 success backtracking leaves it alone.
2671 A pattern is complete when the END op is executed, while a subpattern
2672 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2673 ops trigger the "pop to last yes state if any, otherwise return true"
2676 A common convention in this function is to use A and B to refer to the two
2677 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2678 the subpattern to be matched possibly multiple times, while B is the entire
2679 rest of the pattern. Variable and state names reflect this convention.
2681 The states in the main switch are the union of ops and failure/success of
2682 substates associated with with that op. For example, IFMATCH is the op
2683 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2684 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2685 successfully matched A and IFMATCH_A_fail is a state saying that we have
2686 just failed to match A. Resume states always come in pairs. The backtrack
2687 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2688 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2689 on success or failure.
2691 The struct that holds a backtracking state is actually a big union, with
2692 one variant for each major type of op. The variable st points to the
2693 top-most backtrack struct. To make the code clearer, within each
2694 block of code we #define ST to alias the relevant union.
2696 Here's a concrete example of a (vastly oversimplified) IFMATCH
2702 #define ST st->u.ifmatch
2704 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2705 ST.foo = ...; // some state we wish to save
2707 // push a yes backtrack state with a resume value of
2708 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2710 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2713 case IFMATCH_A: // we have successfully executed A; now continue with B
2715 bar = ST.foo; // do something with the preserved value
2718 case IFMATCH_A_fail: // A failed, so the assertion failed
2719 ...; // do some housekeeping, then ...
2720 sayNO; // propagate the failure
2727 For any old-timers reading this who are familiar with the old recursive
2728 approach, the code above is equivalent to:
2730 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2739 ...; // do some housekeeping, then ...
2740 sayNO; // propagate the failure
2743 The topmost backtrack state, pointed to by st, is usually free. If you
2744 want to claim it, populate any ST.foo fields in it with values you wish to
2745 save, then do one of
2747 PUSH_STATE_GOTO(resume_state, node);
2748 PUSH_YES_STATE_GOTO(resume_state, node);
2750 which sets that backtrack state's resume value to 'resume_state', pushes a
2751 new free entry to the top of the backtrack stack, then goes to 'node'.
2752 On backtracking, the free slot is popped, and the saved state becomes the
2753 new free state. An ST.foo field in this new top state can be temporarily
2754 accessed to retrieve values, but once the main loop is re-entered, it
2755 becomes available for reuse.
2757 Note that the depth of the backtrack stack constantly increases during the
2758 left-to-right execution of the pattern, rather than going up and down with
2759 the pattern nesting. For example the stack is at its maximum at Z at the
2760 end of the pattern, rather than at X in the following:
2762 /(((X)+)+)+....(Y)+....Z/
2764 The only exceptions to this are lookahead/behind assertions and the cut,
2765 (?>A), which pop all the backtrack states associated with A before
2768 Bascktrack state structs are allocated in slabs of about 4K in size.
2769 PL_regmatch_state and st always point to the currently active state,
2770 and PL_regmatch_slab points to the slab currently containing
2771 PL_regmatch_state. The first time regmatch() is called, the first slab is
2772 allocated, and is never freed until interpreter destruction. When the slab
2773 is full, a new one is allocated and chained to the end. At exit from
2774 regmatch(), slabs allocated since entry are freed.
2779 #define DEBUG_STATE_pp(pp) \
2781 DUMP_EXEC_POS(locinput, scan, utf8_target); \
2782 PerlIO_printf(Perl_debug_log, \
2783 " %*s"pp" %s%s%s%s%s\n", \
2785 PL_reg_name[st->resume_state], \
2786 ((st==yes_state||st==mark_state) ? "[" : ""), \
2787 ((st==yes_state) ? "Y" : ""), \
2788 ((st==mark_state) ? "M" : ""), \
2789 ((st==yes_state||st==mark_state) ? "]" : "") \
2794 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2799 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2800 const char *start, const char *end, const char *blurb)
2802 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2804 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2809 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2810 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2812 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2813 start, end - start, 60);
2815 PerlIO_printf(Perl_debug_log,
2816 "%s%s REx%s %s against %s\n",
2817 PL_colors[4], blurb, PL_colors[5], s0, s1);
2819 if (utf8_target||utf8_pat)
2820 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2821 utf8_pat ? "pattern" : "",
2822 utf8_pat && utf8_target ? " and " : "",
2823 utf8_target ? "string" : ""
2829 S_dump_exec_pos(pTHX_ const char *locinput,
2830 const regnode *scan,
2831 const char *loc_regeol,
2832 const char *loc_bostr,
2833 const char *loc_reg_starttry,
2834 const bool utf8_target)
2836 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2837 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2838 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2839 /* The part of the string before starttry has one color
2840 (pref0_len chars), between starttry and current
2841 position another one (pref_len - pref0_len chars),
2842 after the current position the third one.
2843 We assume that pref0_len <= pref_len, otherwise we
2844 decrease pref0_len. */
2845 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2846 ? (5 + taill) - l : locinput - loc_bostr;
2849 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2851 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2853 pref0_len = pref_len - (locinput - loc_reg_starttry);
2854 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2855 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2856 ? (5 + taill) - pref_len : loc_regeol - locinput);
2857 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2861 if (pref0_len > pref_len)
2862 pref0_len = pref_len;
2864 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2866 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2867 (locinput - pref_len),pref0_len, 60, 4, 5);
2869 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2870 (locinput - pref_len + pref0_len),
2871 pref_len - pref0_len, 60, 2, 3);
2873 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2874 locinput, loc_regeol - locinput, 10, 0, 1);
2876 const STRLEN tlen=len0+len1+len2;
2877 PerlIO_printf(Perl_debug_log,
2878 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2879 (IV)(locinput - loc_bostr),
2882 (docolor ? "" : "> <"),
2884 (int)(tlen > 19 ? 0 : 19 - tlen),
2891 /* reg_check_named_buff_matched()
2892 * Checks to see if a named buffer has matched. The data array of
2893 * buffer numbers corresponding to the buffer is expected to reside
2894 * in the regexp->data->data array in the slot stored in the ARG() of
2895 * node involved. Note that this routine doesn't actually care about the
2896 * name, that information is not preserved from compilation to execution.
2897 * Returns the index of the leftmost defined buffer with the given name
2898 * or 0 if non of the buffers matched.
2901 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2904 RXi_GET_DECL(rex,rexi);
2905 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2906 I32 *nums=(I32*)SvPVX(sv_dat);
2908 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2910 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2911 if ((I32)*PL_reglastparen >= nums[n] &&
2912 PL_regoffs[nums[n]].end != -1)
2921 /* free all slabs above current one - called during LEAVE_SCOPE */
2924 S_clear_backtrack_stack(pTHX_ void *p)
2926 regmatch_slab *s = PL_regmatch_slab->next;
2931 PL_regmatch_slab->next = NULL;
2933 regmatch_slab * const osl = s;
2940 #define SETREX(Re1,Re2) \
2941 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2944 STATIC I32 /* 0 failure, 1 success */
2945 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2947 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2951 register const bool utf8_target = PL_reg_match_utf8;
2952 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2953 REGEXP *rex_sv = reginfo->prog;
2954 regexp *rex = (struct regexp *)SvANY(rex_sv);
2955 RXi_GET_DECL(rex,rexi);
2957 /* the current state. This is a cached copy of PL_regmatch_state */
2958 register regmatch_state *st;
2959 /* cache heavy used fields of st in registers */
2960 register regnode *scan;
2961 register regnode *next;
2962 register U32 n = 0; /* general value; init to avoid compiler warning */
2963 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2964 register char *locinput = PL_reginput;
2965 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2967 bool result = 0; /* return value of S_regmatch */
2968 int depth = 0; /* depth of backtrack stack */
2969 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2970 const U32 max_nochange_depth =
2971 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2972 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2973 regmatch_state *yes_state = NULL; /* state to pop to on success of
2975 /* mark_state piggy backs on the yes_state logic so that when we unwind
2976 the stack on success we can update the mark_state as we go */
2977 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2978 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2979 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2981 bool no_final = 0; /* prevent failure from backtracking? */
2982 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2983 char *startpoint = PL_reginput;
2984 SV *popmark = NULL; /* are we looking for a mark? */
2985 SV *sv_commit = NULL; /* last mark name seen in failure */
2986 SV *sv_yes_mark = NULL; /* last mark name we have seen
2987 during a successfull match */
2988 U32 lastopen = 0; /* last open we saw */
2989 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2990 SV* const oreplsv = GvSV(PL_replgv);
2991 /* these three flags are set by various ops to signal information to
2992 * the very next op. They have a useful lifetime of exactly one loop
2993 * iteration, and are not preserved or restored by state pushes/pops
2995 bool sw = 0; /* the condition value in (?(cond)a|b) */
2996 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2997 int logical = 0; /* the following EVAL is:
3001 or the following IFMATCH/UNLESSM is:
3002 false: plain (?=foo)
3003 true: used as a condition: (?(?=foo))
3006 GET_RE_DEBUG_FLAGS_DECL;
3009 PERL_ARGS_ASSERT_REGMATCH;
3011 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3012 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3014 /* on first ever call to regmatch, allocate first slab */
3015 if (!PL_regmatch_slab) {
3016 Newx(PL_regmatch_slab, 1, regmatch_slab);
3017 PL_regmatch_slab->prev = NULL;
3018 PL_regmatch_slab->next = NULL;
3019 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3022 oldsave = PL_savestack_ix;
3023 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3024 SAVEVPTR(PL_regmatch_slab);
3025 SAVEVPTR(PL_regmatch_state);
3027 /* grab next free state slot */
3028 st = ++PL_regmatch_state;
3029 if (st > SLAB_LAST(PL_regmatch_slab))
3030 st = PL_regmatch_state = S_push_slab(aTHX);
3032 /* Note that nextchr is a byte even in UTF */
3033 nextchr = UCHARAT(locinput);
3035 while (scan != NULL) {
3038 SV * const prop = sv_newmortal();
3039 regnode *rnext=regnext(scan);
3040 DUMP_EXEC_POS( locinput, scan, utf8_target );
3041 regprop(rex, prop, scan);
3043 PerlIO_printf(Perl_debug_log,
3044 "%3"IVdf":%*s%s(%"IVdf")\n",
3045 (IV)(scan - rexi->program), depth*2, "",
3047 (PL_regkind[OP(scan)] == END || !rnext) ?
3048 0 : (IV)(rnext - rexi->program));
3051 next = scan + NEXT_OFF(scan);
3054 state_num = OP(scan);
3056 REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3059 assert(PL_reglastparen == &rex->lastparen);
3060 assert(PL_reglastcloseparen == &rex->lastcloseparen);
3061 assert(PL_regoffs == rex->offs);
3063 switch (state_num) {
3065 if (locinput == PL_bostr)
3067 /* reginfo->till = reginfo->bol; */
3072 if (locinput == PL_bostr ||
3073 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3079 if (locinput == PL_bostr)
3083 if (locinput == reginfo->ganch)
3088 /* update the startpoint */
3089 st->u.keeper.val = PL_regoffs[0].start;
3090 PL_reginput = locinput;
3091 PL_regoffs[0].start = locinput - PL_bostr;
3092 PUSH_STATE_GOTO(KEEPS_next, next);
3094 case KEEPS_next_fail:
3095 /* rollback the start point change */
3096 PL_regoffs[0].start = st->u.keeper.val;
3102 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3107 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3109 if (PL_regeol - locinput > 1)
3113 if (PL_regeol != locinput)
3117 if (!nextchr && locinput >= PL_regeol)
3120 locinput += PL_utf8skip[nextchr];
3121 if (locinput > PL_regeol)
3123 nextchr = UCHARAT(locinput);
3126 nextchr = UCHARAT(++locinput);
3129 if (!nextchr && locinput >= PL_regeol)
3131 nextchr = UCHARAT(++locinput);
3134 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3137 locinput += PL_utf8skip[nextchr];
3138 if (locinput > PL_regeol)
3140 nextchr = UCHARAT(locinput);
3143 nextchr = UCHARAT(++locinput);
3147 #define ST st->u.trie
3149 /* In this case the charclass data is available inline so
3150 we can fail fast without a lot of extra overhead.
3152 if (scan->flags == EXACT || !utf8_target) {
3153 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3155 PerlIO_printf(Perl_debug_log,
3156 "%*s %sfailed to match trie start class...%s\n",
3157 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3165 /* the basic plan of execution of the trie is:
3166 * At the beginning, run though all the states, and
3167 * find the longest-matching word. Also remember the position
3168 * of the shortest matching word. For example, this pattern:
3171 * when matched against the string "abcde", will generate
3172 * accept states for all words except 3, with the longest
3173 * matching word being 4, and the shortest being 1 (with
3174 * the position being after char 1 of the string).
3176 * Then for each matching word, in word order (i.e. 1,2,4,5),
3177 * we run the remainder of the pattern; on each try setting
3178 * the current position to the character following the word,
3179 * returning to try the next word on failure.
3181 * We avoid having to build a list of words at runtime by
3182 * using a compile-time structure, wordinfo[].prev, which
3183 * gives, for each word, the previous accepting word (if any).
3184 * In the case above it would contain the mappings 1->2, 2->0,
3185 * 3->0, 4->5, 5->1. We can use this table to generate, from
3186 * the longest word (4 above), a list of all words, by
3187 * following the list of prev pointers; this gives us the
3188 * unordered list 4,5,1,2. Then given the current word we have
3189 * just tried, we can go through the list and find the
3190 * next-biggest word to try (so if we just failed on word 2,
3191 * the next in the list is 4).
3193 * Since at runtime we don't record the matching position in
3194 * the string for each word, we have to work that out for
3195 * each word we're about to process. The wordinfo table holds
3196 * the character length of each word; given that we recorded
3197 * at the start: the position of the shortest word and its
3198 * length in chars, we just need to move the pointer the
3199 * difference between the two char lengths. Depending on
3200 * Unicode status and folding, that's cheap or expensive.
3202 * This algorithm is optimised for the case where are only a
3203 * small number of accept states, i.e. 0,1, or maybe 2.
3204 * With lots of accepts states, and having to try all of them,
3205 * it becomes quadratic on number of accept states to find all
3210 /* what type of TRIE am I? (utf8 makes this contextual) */
3211 DECL_TRIE_TYPE(scan);
3213 /* what trie are we using right now */
3214 reg_trie_data * const trie
3215 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3216 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3217 U32 state = trie->startstate;
3219 if (trie->bitmap && trie_type != trie_utf8_fold &&
3220 !TRIE_BITMAP_TEST(trie,*locinput)
3222 if (trie->states[ state ].wordnum) {
3224 PerlIO_printf(Perl_debug_log,
3225 "%*s %smatched empty string...%s\n",
3226 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3232 PerlIO_printf(Perl_debug_log,
3233 "%*s %sfailed to match trie start class...%s\n",
3234 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3241 U8 *uc = ( U8* )locinput;
3245 U8 *uscan = (U8*)NULL;
3246 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3247 U32 charcount = 0; /* how many input chars we have matched */
3248 U32 accepted = 0; /* have we seen any accepting states? */
3251 ST.jump = trie->jump;
3254 ST.longfold = FALSE; /* char longer if folded => it's harder */
3257 /* fully traverse the TRIE; note the position of the
3258 shortest accept state and the wordnum of the longest
3261 while ( state && uc <= (U8*)PL_regeol ) {
3262 U32 base = trie->states[ state ].trans.base;
3266 wordnum = trie->states[ state ].wordnum;
3268 if (wordnum) { /* it's an accept state */
3271 /* record first match position */
3273 ST.firstpos = (U8*)locinput;
3278 ST.firstchars = charcount;
3281 if (!ST.nextword || wordnum < ST.nextword)
3282 ST.nextword = wordnum;
3283 ST.topword = wordnum;
3286 DEBUG_TRIE_EXECUTE_r({
3287 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3288 PerlIO_printf( Perl_debug_log,
3289 "%*s %sState: %4"UVxf" Accepted: %c ",
3290 2+depth * 2, "", PL_colors[4],
3291 (UV)state, (accepted ? 'Y' : 'N'));
3294 /* read a char and goto next state */
3297 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3298 uscan, len, uvc, charid, foldlen,
3305 base + charid - 1 - trie->uniquecharcount)) >= 0)
3307 && ((U32)offset < trie->lasttrans)
3308 && trie->trans[offset].check == state)
3310 state = trie->trans[offset].next;
3321 DEBUG_TRIE_EXECUTE_r(
3322 PerlIO_printf( Perl_debug_log,
3323 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3324 charid, uvc, (UV)state, PL_colors[5] );
3330 /* calculate total number of accept states */
3335 w = trie->wordinfo[w].prev;
3338 ST.accepted = accepted;
3342 PerlIO_printf( Perl_debug_log,
3343 "%*s %sgot %"IVdf" possible matches%s\n",
3344 REPORT_CODE_OFF + depth * 2, "",
3345 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3347 goto trie_first_try; /* jump into the fail handler */
3351 case TRIE_next_fail: /* we failed - try next alternative */
3353 REGCP_UNWIND(ST.cp);
3354 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3355 PL_regoffs[n].end = -1;
3356 *PL_reglastparen = n;
3358 if (!--ST.accepted) {
3360 PerlIO_printf( Perl_debug_log,
3361 "%*s %sTRIE failed...%s\n",
3362 REPORT_CODE_OFF+depth*2, "",
3369 /* Find next-highest word to process. Note that this code
3370 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3371 register U16 min = 0;
3373 register U16 const nextword = ST.nextword;
3374 register reg_trie_wordinfo * const wordinfo
3375 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3376 for (word=ST.topword; word; word=wordinfo[word].prev) {
3377 if (word > nextword && (!min || word < min))
3390 ST.lastparen = *PL_reglastparen;
3394 /* find start char of end of current word */
3396 U32 chars; /* how many chars to skip */
3397 U8 *uc = ST.firstpos;
3398 reg_trie_data * const trie
3399 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3401 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3403 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3407 /* the hard option - fold each char in turn and find
3408 * its folded length (which may be different */
3409 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3417 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3425 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3430 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3444 PL_reginput = (char *)uc;
3447 scan = (ST.jump && ST.jump[ST.nextword])
3448 ? ST.me + ST.jump[ST.nextword]
3452 PerlIO_printf( Perl_debug_log,
3453 "%*s %sTRIE matched word #%d, continuing%s\n",
3454 REPORT_CODE_OFF+depth*2, "",
3461 if (ST.accepted > 1 || has_cutgroup) {
3462 PUSH_STATE_GOTO(TRIE_next, scan);
3465 /* only one choice left - just continue */
3467 AV *const trie_words
3468 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3469 SV ** const tmp = av_fetch( trie_words,
3471 SV *sv= tmp ? sv_newmortal() : NULL;
3473 PerlIO_printf( Perl_debug_log,
3474 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3475 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3477 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3478 PL_colors[0], PL_colors[1],
3479 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3481 : "not compiled under -Dr",
3485 locinput = PL_reginput;
3486 nextchr = UCHARAT(locinput);
3487 continue; /* execute rest of RE */
3492 char *s = STRING(scan);
3494 if (utf8_target != UTF_PATTERN) {
3495 /* The target and the pattern have differing utf8ness. */
3497 const char * const e = s + ln;
3500 /* The target is utf8, the pattern is not utf8. */
3505 if (NATIVE_TO_UNI(*(U8*)s) !=
3506 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3514 /* The target is not utf8, the pattern is utf8. */
3519 if (NATIVE_TO_UNI(*((U8*)l)) !=
3520 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3528 nextchr = UCHARAT(locinput);
3531 /* The target and the pattern have the same utf8ness. */
3532 /* Inline the first character, for speed. */
3533 if (UCHARAT(s) != nextchr)
3535 if (PL_regeol - locinput < ln)
3537 if (ln > 1 && memNE(s, locinput, ln))
3540 nextchr = UCHARAT(locinput);
3544 PL_reg_flags |= RF_tainted;
3547 char * const s = STRING(scan);
3550 if (utf8_target || UTF_PATTERN) {
3551 /* Either target or the pattern are utf8. */
3552 const char * const l = locinput;
3553 char *e = PL_regeol;
3555 if (! foldEQ_utf8(s, 0, ln, cBOOL(UTF_PATTERN),
3556 l, &e, 0, utf8_target)) {
3557 /* One more case for the sharp s:
3558 * pack("U0U*", 0xDF) =~ /ss/i,
3559 * the 0xC3 0x9F are the UTF-8
3560 * byte sequence for the U+00DF. */
3562 if (!(utf8_target &&
3563 toLOWER(s[0]) == 's' &&
3565 toLOWER(s[1]) == 's' &&
3572 nextchr = UCHARAT(locinput);
3576 /* Neither the target and the pattern are utf8. */
3578 /* Inline the first character, for speed. */
3579 if (UCHARAT(s) != nextchr &&
3580 UCHARAT(s) != ((OP(scan) == EXACTF)
3581 ? PL_fold : PL_fold_locale)[nextchr])
3583 if (PL_regeol - locinput < ln)
3585 if (ln > 1 && (OP(scan) == EXACTF
3586 ? ! foldEQ(s, locinput, ln)
3587 : ! foldEQ_locale(s, locinput, ln)))
3590 nextchr = UCHARAT(locinput);
3595 PL_reg_flags |= RF_tainted;
3599 /* was last char in word? */
3601 if (locinput == PL_bostr)
3604 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3606 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3608 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3609 ln = isALNUM_uni(ln);
3610 LOAD_UTF8_CHARCLASS_ALNUM();
3611 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3614 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3615 n = isALNUM_LC_utf8((U8*)locinput);
3619 ln = (locinput != PL_bostr) ?
3620 UCHARAT(locinput - 1) : '\n';
3621 if (FLAGS(scan) & USE_UNI) {
3623 /* Here, can't be BOUNDL or NBOUNDL because they never set
3624 * the flags to USE_UNI */
3625 ln = isWORDCHAR_L1(ln);
3626 n = isWORDCHAR_L1(nextchr);
3628 else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3630 n = isALNUM(nextchr);
3633 ln = isALNUM_LC(ln);
3634 n = isALNUM_LC(nextchr);
3637 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3638 OP(scan) == BOUNDL))
3643 STRLEN inclasslen = PL_regeol - locinput;
3644 if (locinput >= PL_regeol)
3647 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3649 locinput += inclasslen;
3650 nextchr = UCHARAT(locinput);
3655 nextchr = UCHARAT(locinput);
3656 if (!nextchr && locinput >= PL_regeol)
3658 if (!REGINCLASS(rex, scan, (U8*)locinput))
3660 nextchr = UCHARAT(++locinput);
3664 /* If we might have the case of the German sharp s
3665 * in a casefolding Unicode character class. */
3667 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3668 locinput += SHARP_S_SKIP;
3669 nextchr = UCHARAT(locinput);
3674 /* Special char classes - The defines start on line 129 or so */
3675 CCC_TRY_AFF_U( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3676 CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3678 CCC_TRY_AFF_U( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3679 CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3681 CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3682 CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3684 case CLUMP: /* Match \X: logical Unicode character. This is defined as
3685 a Unicode extended Grapheme Cluster */
3686 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
3687 extended Grapheme Cluster is:
3690 | Prepend* Begin Extend*
3693 Begin is (Hangul-syllable | ! Control)
3694 Extend is (Grapheme_Extend | Spacing_Mark)
3695 Control is [ GCB_Control CR LF ]
3697 The discussion below shows how the code for CLUMP is derived
3698 from this regex. Note that most of these concepts are from
3699 property values of the Grapheme Cluster Boundary (GCB) property.
3700 No code point can have multiple property values for a given
3701 property. Thus a code point in Prepend can't be in Control, but
3702 it must be in !Control. This is why Control above includes
3703 GCB_Control plus CR plus LF. The latter two are used in the GCB
3704 property separately, and so can't be in GCB_Control, even though
3705 they logically are controls. Control is not the same as gc=cc,
3706 but includes format and other characters as well.
3708 The Unicode definition of Hangul-syllable is:
3710 | (L* ( ( V | LV ) V* | LVT ) T*)
3713 Each of these is a value for the GCB property, and hence must be
3714 disjoint, so the order they are tested is immaterial, so the
3715 above can safely be changed to
3718 | (L* ( LVT | ( V | LV ) V*) T*)
3720 The last two terms can be combined like this:
3722 | (( LVT | ( V | LV ) V*) T*))
3724 And refactored into this:
3725 L* (L | LVT T* | V V* T* | LV V* T*)
3727 That means that if we have seen any L's at all we can quit
3728 there, but if the next character is a LVT, a V or and LV we
3731 There is a subtlety with Prepend* which showed up in testing.
3732 Note that the Begin, and only the Begin is required in:
3733 | Prepend* Begin Extend*
3734 Also, Begin contains '! Control'. A Prepend must be a '!
3735 Control', which means it must be a Begin. What it comes down to
3736 is that if we match Prepend* and then find no suitable Begin
3737 afterwards, that if we backtrack the last Prepend, that one will
3738 be a suitable Begin.
3741 if (locinput >= PL_regeol)
3743 if (! utf8_target) {
3745 /* Match either CR LF or '.', as all the other possibilities
3747 locinput++; /* Match the . or CR */
3749 && locinput < PL_regeol
3750 && UCHARAT(locinput) == '\n') locinput++;
3754 /* Utf8: See if is ( CR LF ); already know that locinput <
3755 * PL_regeol, so locinput+1 is in bounds */
3756 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3760 /* In case have to backtrack to beginning, then match '.' */
3761 char *starting = locinput;
3763 /* In case have to backtrack the last prepend */
3764 char *previous_prepend = 0;
3766 LOAD_UTF8_CHARCLASS_GCB();
3768 /* Match (prepend)* */
3769 while (locinput < PL_regeol
3770 && swash_fetch(PL_utf8_X_prepend,
3771 (U8*)locinput, utf8_target))
3773 previous_prepend = locinput;
3774 locinput += UTF8SKIP(locinput);
3777 /* As noted above, if we matched a prepend character, but
3778 * the next thing won't match, back off the last prepend we
3779 * matched, as it is guaranteed to match the begin */
3780 if (previous_prepend
3781 && (locinput >= PL_regeol
3782 || ! swash_fetch(PL_utf8_X_begin,
3783 (U8*)locinput, utf8_target)))
3785 locinput = previous_prepend;
3788 /* Note that here we know PL_regeol > locinput, as we
3789 * tested that upon input to this switch case, and if we
3790 * moved locinput forward, we tested the result just above
3791 * and it either passed, or we backed off so that it will
3793 if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3795 /* Here did not match the required 'Begin' in the
3796 * second term. So just match the very first
3797 * character, the '.' of the final term of the regex */
3798 locinput = starting + UTF8SKIP(starting);
3801 /* Here is the beginning of a character that can have
3802 * an extender. It is either a hangul syllable, or a
3804 if (swash_fetch(PL_utf8_X_non_hangul,
3805 (U8*)locinput, utf8_target))
3808 /* Here not a Hangul syllable, must be a
3809 * ('! * Control') */
3810 locinput += UTF8SKIP(locinput);
3813 /* Here is a Hangul syllable. It can be composed
3814 * of several individual characters. One
3815 * possibility is T+ */
3816 if (swash_fetch(PL_utf8_X_T,
3817 (U8*)locinput, utf8_target))
3819 while (locinput < PL_regeol
3820 && swash_fetch(PL_utf8_X_T,
3821 (U8*)locinput, utf8_target))
3823 locinput += UTF8SKIP(locinput);
3827 /* Here, not T+, but is a Hangul. That means
3828 * it is one of the others: L, LV, LVT or V,
3830 * L* (L | LVT T* | V V* T* | LV V* T*) */
3833 while (locinput < PL_regeol
3834 && swash_fetch(PL_utf8_X_L,
3835 (U8*)locinput, utf8_target))
3837 locinput += UTF8SKIP(locinput);
3840 /* Here, have exhausted L*. If the next
3841 * character is not an LV, LVT nor V, it means
3842 * we had to have at least one L, so matches L+
3843 * in the original equation, we have a complete
3844 * hangul syllable. Are done. */
3846 if (locinput < PL_regeol
3847 && swash_fetch(PL_utf8_X_LV_LVT_V,
3848 (U8*)locinput, utf8_target))
3851 /* Otherwise keep going. Must be LV, LVT
3852 * or V. See if LVT */
3853 if (swash_fetch(PL_utf8_X_LVT,
3854 (U8*)locinput, utf8_target))
3856 locinput += UTF8SKIP(locinput);
3859 /* Must be V or LV. Take it, then
3861 locinput += UTF8SKIP(locinput);
3862 while (locinput < PL_regeol
3863 && swash_fetch(PL_utf8_X_V,
3864 (U8*)locinput, utf8_target))
3866 locinput += UTF8SKIP(locinput);
3870 /* And any of LV, LVT, or V can be followed
3872 while (locinput < PL_regeol
3873 && swash_fetch(PL_utf8_X_T,
3877 locinput += UTF8SKIP(locinput);
3883 /* Match any extender */
3884 while (locinput < PL_regeol
3885 && swash_fetch(PL_utf8_X_extend,
3886 (U8*)locinput, utf8_target))
3888 locinput += UTF8SKIP(locinput);
3892 if (locinput > PL_regeol) sayNO;
3894 nextchr = UCHARAT(locinput);
3901 PL_reg_flags |= RF_tainted;
3906 n = reg_check_named_buff_matched(rex,scan);
3909 type = REF + ( type - NREF );
3916 PL_reg_flags |= RF_tainted;
3920 n = ARG(scan); /* which paren pair */
3923 ln = PL_regoffs[n].start;
3924 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3925 if (*PL_reglastparen < n || ln == -1)
3926 sayNO; /* Do not match unless seen CLOSEn. */
3927 if (ln == PL_regoffs[n].end)
3931 if (utf8_target && type != REF) { /* REF can do byte comparison */
3933 const char *e = PL_bostr + PL_regoffs[n].end;
3935 * Note that we can't do the "other character" lookup trick as
3936 * in the 8-bit case (no pun intended) because in Unicode we
3937 * have to map both upper and title case to lower case.
3941 STRLEN ulen1, ulen2;
3942 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3943 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3947 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3948 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3949 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3956 nextchr = UCHARAT(locinput);
3960 /* Inline the first character, for speed. */
3961 if (UCHARAT(s) != nextchr &&
3963 (UCHARAT(s) != (type == REFF
3964 ? PL_fold : PL_fold_locale)[nextchr])))
3966 ln = PL_regoffs[n].end - ln;
3967 if (locinput + ln > PL_regeol)
3969 if (ln > 1 && (type == REF
3970 ? memNE(s, locinput, ln)
3972 ? ! foldEQ(s, locinput, ln)
3973 : ! foldEQ_locale(s, locinput, ln))))
3976 nextchr = UCHARAT(locinput);
3986 #define ST st->u.eval
3991 regexp_internal *rei;
3992 regnode *startpoint;
3995 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3996 if (cur_eval && cur_eval->locinput==locinput) {
3997 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3998 Perl_croak(aTHX_ "Infinite recursion in regex");
3999 if ( ++nochange_depth > max_nochange_depth )
4001 "Pattern subroutine nesting without pos change"
4002 " exceeded limit in regex");
4009 (void)ReREFCNT_inc(rex_sv);
4010 if (OP(scan)==GOSUB) {
4011 startpoint = scan + ARG2L(scan);
4012 ST.close_paren = ARG(scan);
4014 startpoint = rei->program+1;
4017 goto eval_recurse_doit;
4019 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4020 if (cur_eval && cur_eval->locinput==locinput) {
4021 if ( ++nochange_depth > max_nochange_depth )
4022 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4027 /* execute the code in the {...} */
4029 SV ** const before = SP;
4030 OP_4tree * const oop = PL_op;
4031 COP * const ocurcop = PL_curcop;
4033 char *saved_regeol = PL_regeol;
4034 struct re_save_state saved_state;
4036 /* To not corrupt the existing regex state while executing the
4037 * eval we would normally put it on the save stack, like with
4038 * save_re_context. However, re-evals have a weird scoping so we
4039 * can't just add ENTER/LEAVE here. With that, things like
4041 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4043 * would break, as they expect the localisation to be unwound
4044 * only when the re-engine backtracks through the bit that
4047 * What we do instead is just saving the state in a local c
4050 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4053 PL_op = (OP_4tree*)rexi->data->data[n];
4054 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4055 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4056 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4057 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4060 SV *sv_mrk = get_sv("REGMARK", 1);
4061 sv_setsv(sv_mrk, sv_yes_mark);
4064 CALLRUNOPS(aTHX); /* Scalar context. */
4067 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
4073 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4076 PAD_RESTORE_LOCAL(old_comppad);
4077 PL_curcop = ocurcop;
4078 PL_regeol = saved_regeol;
4081 sv_setsv(save_scalar(PL_replgv), ret);
4085 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4088 /* extract RE object from returned value; compiling if
4094 SV *const sv = SvRV(ret);
4096 if (SvTYPE(sv) == SVt_REGEXP) {
4098 } else if (SvSMAGICAL(sv)) {
4099 mg = mg_find(sv, PERL_MAGIC_qr);
4102 } else if (SvTYPE(ret) == SVt_REGEXP) {
4104 } else if (SvSMAGICAL(ret)) {
4105 if (SvGMAGICAL(ret)) {
4106 /* I don't believe that there is ever qr magic
4108 assert(!mg_find(ret, PERL_MAGIC_qr));
4109 sv_unmagic(ret, PERL_MAGIC_qr);
4112 mg = mg_find(ret, PERL_MAGIC_qr);
4113 /* testing suggests mg only ends up non-NULL for
4114 scalars who were upgraded and compiled in the
4115 else block below. In turn, this is only
4116 triggered in the "postponed utf8 string" tests
4122 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4126 rx = reg_temp_copy(NULL, rx);
4130 const I32 osize = PL_regsize;
4133 assert (SvUTF8(ret));
4134 } else if (SvUTF8(ret)) {
4135 /* Not doing UTF-8, despite what the SV says. Is
4136 this only if we're trapped in use 'bytes'? */
4137 /* Make a copy of the octet sequence, but without
4138 the flag on, as the compiler now honours the
4139 SvUTF8 flag on ret. */
4141 const char *const p = SvPV(ret, len);
4142 ret = newSVpvn_flags(p, len, SVs_TEMP);
4144 rx = CALLREGCOMP(ret, pm_flags);
4146 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4148 /* This isn't a first class regexp. Instead, it's
4149 caching a regexp onto an existing, Perl visible
4151 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4156 re = (struct regexp *)SvANY(rx);
4158 RXp_MATCH_COPIED_off(re);
4159 re->subbeg = rex->subbeg;
4160 re->sublen = rex->sublen;
4163 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4164 "Matching embedded");
4166 startpoint = rei->program + 1;
4167 ST.close_paren = 0; /* only used for GOSUB */
4168 /* borrowed from regtry */
4169 if (PL_reg_start_tmpl <= re->nparens) {
4170 PL_reg_start_tmpl = re->nparens*3/2 + 3;
4171 if(PL_reg_start_tmp)
4172 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4174 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4177 eval_recurse_doit: /* Share code with GOSUB below this line */
4178 /* run the pattern returned from (??{...}) */
4179 ST.cp = regcppush(0); /* Save *all* the positions. */
4180 REGCP_SET(ST.lastcp);
4182 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4184 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4185 PL_reglastparen = &re->lastparen;
4186 PL_reglastcloseparen = &re->lastcloseparen;
4188 re->lastcloseparen = 0;
4190 PL_reginput = locinput;
4193 /* XXXX This is too dramatic a measure... */
4196 ST.toggle_reg_flags = PL_reg_flags;
4198 PL_reg_flags |= RF_utf8;
4200 PL_reg_flags &= ~RF_utf8;
4201 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4203 ST.prev_rex = rex_sv;
4204 ST.prev_curlyx = cur_curlyx;
4205 SETREX(rex_sv,re_sv);
4210 ST.prev_eval = cur_eval;
4212 /* now continue from first node in postoned RE */
4213 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4216 /* logical is 1, /(?(?{...})X|Y)/ */
4217 sw = cBOOL(SvTRUE(ret));
4222 case EVAL_AB: /* cleanup after a successful (??{A})B */
4223 /* note: this is called twice; first after popping B, then A */
4224 PL_reg_flags ^= ST.toggle_reg_flags;
4225 ReREFCNT_dec(rex_sv);
4226 SETREX(rex_sv,ST.prev_rex);
4227 rex = (struct regexp *)SvANY(rex_sv);
4228 rexi = RXi_GET(rex);
4230 cur_eval = ST.prev_eval;
4231 cur_curlyx = ST.prev_curlyx;
4233 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4234 PL_reglastparen = &rex->lastparen;
4235 PL_reglastcloseparen = &rex->lastcloseparen;
4236 /* also update PL_regoffs */
4237 PL_regoffs = rex->offs;
4239 /* XXXX This is too dramatic a measure... */
4241 if ( nochange_depth )
4246 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4247 /* note: this is called twice; first after popping B, then A */
4248 PL_reg_flags ^= ST.toggle_reg_flags;
4249 ReREFCNT_dec(rex_sv);
4250 SETREX(rex_sv,ST.prev_rex);
4251 rex = (struct regexp *)SvANY(rex_sv);
4252 rexi = RXi_GET(rex);
4253 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4254 PL_reglastparen = &rex->lastparen;
4255 PL_reglastcloseparen = &rex->lastcloseparen;
4257 PL_reginput = locinput;
4258 REGCP_UNWIND(ST.lastcp);
4260 cur_eval = ST.prev_eval;
4261 cur_curlyx = ST.prev_curlyx;
4262 /* XXXX This is too dramatic a measure... */
4264 if ( nochange_depth )
4270 n = ARG(scan); /* which paren pair */
4271 PL_reg_start_tmp[n] = locinput;
4277 n = ARG(scan); /* which paren pair */
4278 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4279 PL_regoffs[n].end = locinput - PL_bostr;
4280 /*if (n > PL_regsize)
4282 if (n > *PL_reglastparen)
4283 *PL_reglastparen = n;
4284 *PL_reglastcloseparen = n;
4285 if (cur_eval && cur_eval->u.eval.close_paren == n) {
4293 cursor && OP(cursor)!=END;
4294 cursor=regnext(cursor))
4296 if ( OP(cursor)==CLOSE ){
4298 if ( n <= lastopen ) {
4300 = PL_reg_start_tmp[n] - PL_bostr;
4301 PL_regoffs[n].end = locinput - PL_bostr;
4302 /*if (n > PL_regsize)
4304 if (n > *PL_reglastparen)
4305 *PL_reglastparen = n;
4306 *PL_reglastcloseparen = n;
4307 if ( n == ARG(scan) || (cur_eval &&
4308 cur_eval->u.eval.close_paren == n))
4317 n = ARG(scan); /* which paren pair */
4318 sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4321 /* reg_check_named_buff_matched returns 0 for no match */
4322 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4326 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4332 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4334 next = NEXTOPER(NEXTOPER(scan));
4336 next = scan + ARG(scan);
4337 if (OP(next) == IFTHEN) /* Fake one. */
4338 next = NEXTOPER(NEXTOPER(next));
4342 logical = scan->flags;
4345 /*******************************************************************
4347 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4348 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4349 STAR/PLUS/CURLY/CURLYN are used instead.)
4351 A*B is compiled as <CURLYX><A><WHILEM><B>
4353 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4354 state, which contains the current count, initialised to -1. It also sets
4355 cur_curlyx to point to this state, with any previous value saved in the
4358 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4359 since the pattern may possibly match zero times (i.e. it's a while {} loop
4360 rather than a do {} while loop).
4362 Each entry to WHILEM represents a successful match of A. The count in the
4363 CURLYX block is incremented, another WHILEM state is pushed, and execution
4364 passes to A or B depending on greediness and the current count.
4366 For example, if matching against the string a1a2a3b (where the aN are
4367 substrings that match /A/), then the match progresses as follows: (the
4368 pushed states are interspersed with the bits of strings matched so far):
4371 <CURLYX cnt=0><WHILEM>
4372 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4373 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4374 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4375 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4377 (Contrast this with something like CURLYM, which maintains only a single
4381 a1 <CURLYM cnt=1> a2
4382 a1 a2 <CURLYM cnt=2> a3
4383 a1 a2 a3 <CURLYM cnt=3> b
4386 Each WHILEM state block marks a point to backtrack to upon partial failure
4387 of A or B, and also contains some minor state data related to that
4388 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4389 overall state, such as the count, and pointers to the A and B ops.
4391 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4392 must always point to the *current* CURLYX block, the rules are:
4394 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4395 and set cur_curlyx to point the new block.
4397 When popping the CURLYX block after a successful or unsuccessful match,
4398 restore the previous cur_curlyx.
4400 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4401 to the outer one saved in the CURLYX block.
4403 When popping the WHILEM block after a successful or unsuccessful B match,
4404 restore the previous cur_curlyx.
4406 Here's an example for the pattern (AI* BI)*BO
4407 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4410 curlyx backtrack stack
4411 ------ ---------------
4413 CO <CO prev=NULL> <WO>
4414 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4415 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4416 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4418 At this point the pattern succeeds, and we work back down the stack to
4419 clean up, restoring as we go:
4421 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4422 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4423 CO <CO prev=NULL> <WO>
4426 *******************************************************************/
4428 #define ST st->u.curlyx
4430 case CURLYX: /* start of /A*B/ (for complex A) */
4432 /* No need to save/restore up to this paren */
4433 I32 parenfloor = scan->flags;
4435 assert(next); /* keep Coverity happy */
4436 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4439 /* XXXX Probably it is better to teach regpush to support
4440 parenfloor > PL_regsize... */
4441 if (parenfloor > (I32)*PL_reglastparen)
4442 parenfloor = *PL_reglastparen; /* Pessimization... */
4444 ST.prev_curlyx= cur_curlyx;
4446 ST.cp = PL_savestack_ix;
4448 /* these fields contain the state of the current curly.
4449 * they are accessed by subsequent WHILEMs */
4450 ST.parenfloor = parenfloor;
4455 ST.count = -1; /* this will be updated by WHILEM */
4456 ST.lastloc = NULL; /* this will be updated by WHILEM */
4458 PL_reginput = locinput;
4459 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4463 case CURLYX_end: /* just finished matching all of A*B */
4464 cur_curlyx = ST.prev_curlyx;
4468 case CURLYX_end_fail: /* just failed to match all of A*B */
4470 cur_curlyx = ST.prev_curlyx;
4476 #define ST st->u.whilem
4478 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4480 /* see the discussion above about CURLYX/WHILEM */
4482 int min = ARG1(cur_curlyx->u.curlyx.me);
4483 int max = ARG2(cur_curlyx->u.curlyx.me);
4484 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4486 assert(cur_curlyx); /* keep Coverity happy */
4487 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4488 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4489 ST.cache_offset = 0;
4492 PL_reginput = locinput;
4494 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4495 "%*s whilem: matched %ld out of %d..%d\n",
4496 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4499 /* First just match a string of min A's. */
4502 cur_curlyx->u.curlyx.lastloc = locinput;
4503 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4507 /* If degenerate A matches "", assume A done. */
4509 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4510 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4511 "%*s whilem: empty match detected, trying continuation...\n",
4512 REPORT_CODE_OFF+depth*2, "")
4514 goto do_whilem_B_max;
4517 /* super-linear cache processing */
4521 if (!PL_reg_maxiter) {
4522 /* start the countdown: Postpone detection until we
4523 * know the match is not *that* much linear. */
4524 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4525 /* possible overflow for long strings and many CURLYX's */
4526 if (PL_reg_maxiter < 0)
4527 PL_reg_maxiter = I32_MAX;
4528 PL_reg_leftiter = PL_reg_maxiter;
4531 if (PL_reg_leftiter-- == 0) {
4532 /* initialise cache */
4533 const I32 size = (PL_reg_maxiter + 7)/8;
4534 if (PL_reg_poscache) {
4535 if ((I32)PL_reg_poscache_size < size) {
4536 Renew(PL_reg_poscache, size, char);
4537 PL_reg_poscache_size = size;
4539 Zero(PL_reg_poscache, size, char);
4542 PL_reg_poscache_size = size;
4543 Newxz(PL_reg_poscache, size, char);
4545 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4546 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4547 PL_colors[4], PL_colors[5])
4551 if (PL_reg_leftiter < 0) {
4552 /* have we already failed at this position? */
4554 offset = (scan->flags & 0xf) - 1
4555 + (locinput - PL_bostr) * (scan->flags>>4);
4556 mask = 1 << (offset % 8);
4558 if (PL_reg_poscache[offset] & mask) {
4559 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4560 "%*s whilem: (cache) already tried at this position...\n",
4561 REPORT_CODE_OFF+depth*2, "")
4563 sayNO; /* cache records failure */
4565 ST.cache_offset = offset;
4566 ST.cache_mask = mask;
4570 /* Prefer B over A for minimal matching. */
4572 if (cur_curlyx->u.curlyx.minmod) {
4573 ST.save_curlyx = cur_curlyx;
4574 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4575 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4576 REGCP_SET(ST.lastcp);
4577 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4581 /* Prefer A over B for maximal matching. */
4583 if (n < max) { /* More greed allowed? */
4584 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4585 cur_curlyx->u.curlyx.lastloc = locinput;
4586 REGCP_SET(ST.lastcp);
4587 PUSH_STATE_GOTO(WHILEM_A_max, A);
4590 goto do_whilem_B_max;
4594 case WHILEM_B_min: /* just matched B in a minimal match */
4595 case WHILEM_B_max: /* just matched B in a maximal match */
4596 cur_curlyx = ST.save_curlyx;
4600 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4601 cur_curlyx = ST.save_curlyx;
4602 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4603 cur_curlyx->u.curlyx.count--;
4607 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4608 REGCP_UNWIND(ST.lastcp);
4611 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4612 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4613 cur_curlyx->u.curlyx.count--;
4617 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4618 REGCP_UNWIND(ST.lastcp);
4619 regcppop(rex); /* Restore some previous $<digit>s? */
4620 PL_reginput = locinput;
4621 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4622 "%*s whilem: failed, trying continuation...\n",
4623 REPORT_CODE_OFF+depth*2, "")
4626 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4627 && ckWARN(WARN_REGEXP)
4628 && !(PL_reg_flags & RF_warned))
4630 PL_reg_flags |= RF_warned;
4631 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4632 "Complex regular subexpression recursion",
4637 ST.save_curlyx = cur_curlyx;
4638 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4639 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4642 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4643 cur_curlyx = ST.save_curlyx;
4644 REGCP_UNWIND(ST.lastcp);
4647 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4648 /* Maximum greed exceeded */
4649 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4650 && ckWARN(WARN_REGEXP)
4651 && !(PL_reg_flags & RF_warned))
4653 PL_reg_flags |= RF_warned;
4654 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4655 "%s limit (%d) exceeded",
4656 "Complex regular subexpression recursion",
4659 cur_curlyx->u.curlyx.count--;
4663 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4664 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4666 /* Try grabbing another A and see if it helps. */
4667 PL_reginput = locinput;
4668 cur_curlyx->u.curlyx.lastloc = locinput;
4669 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4670 REGCP_SET(ST.lastcp);
4671 PUSH_STATE_GOTO(WHILEM_A_min,
4672 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4676 #define ST st->u.branch
4678 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4679 next = scan + ARG(scan);
4682 scan = NEXTOPER(scan);
4685 case BRANCH: /* /(...|A|...)/ */
4686 scan = NEXTOPER(scan); /* scan now points to inner node */
4687 ST.lastparen = *PL_reglastparen;
4688 ST.next_branch = next;
4690 PL_reginput = locinput;
4692 /* Now go into the branch */
4694 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4696 PUSH_STATE_GOTO(BRANCH_next, scan);
4700 PL_reginput = locinput;
4701 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4702 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4703 PUSH_STATE_GOTO(CUTGROUP_next,next);
4705 case CUTGROUP_next_fail:
4708 if (st->u.mark.mark_name)
4709 sv_commit = st->u.mark.mark_name;
4715 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4720 REGCP_UNWIND(ST.cp);
4721 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4722 PL_regoffs[n].end = -1;
4723 *PL_reglastparen = n;
4724 /*dmq: *PL_reglastcloseparen = n; */
4725 scan = ST.next_branch;
4726 /* no more branches? */
4727 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4729 PerlIO_printf( Perl_debug_log,
4730 "%*s %sBRANCH failed...%s\n",
4731 REPORT_CODE_OFF+depth*2, "",
4737 continue; /* execute next BRANCH[J] op */
4745 #define ST st->u.curlym
4747 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4749 /* This is an optimisation of CURLYX that enables us to push
4750 * only a single backtracking state, no matter how many matches
4751 * there are in {m,n}. It relies on the pattern being constant
4752 * length, with no parens to influence future backrefs
4756 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4758 /* if paren positive, emulate an OPEN/CLOSE around A */
4760 U32 paren = ST.me->flags;
4761 if (paren > PL_regsize)
4763 if (paren > *PL_reglastparen)
4764 *PL_reglastparen = paren;
4765 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4773 ST.c1 = CHRTEST_UNINIT;
4776 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4779 curlym_do_A: /* execute the A in /A{m,n}B/ */
4780 PL_reginput = locinput;
4781 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4784 case CURLYM_A: /* we've just matched an A */
4785 locinput = st->locinput;
4786 nextchr = UCHARAT(locinput);
4789 /* after first match, determine A's length: u.curlym.alen */
4790 if (ST.count == 1) {
4791 if (PL_reg_match_utf8) {
4793 while (s < PL_reginput) {
4799 ST.alen = PL_reginput - locinput;
4802 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4805 PerlIO_printf(Perl_debug_log,
4806 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4807 (int)(REPORT_CODE_OFF+(depth*2)), "",
4808 (IV) ST.count, (IV)ST.alen)
4811 locinput = PL_reginput;
4813 if (cur_eval && cur_eval->u.eval.close_paren &&
4814 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4818 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4819 if ( max == REG_INFTY || ST.count < max )
4820 goto curlym_do_A; /* try to match another A */
4822 goto curlym_do_B; /* try to match B */
4824 case CURLYM_A_fail: /* just failed to match an A */
4825 REGCP_UNWIND(ST.cp);
4827 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4828 || (cur_eval && cur_eval->u.eval.close_paren &&
4829 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4832 curlym_do_B: /* execute the B in /A{m,n}B/ */
4833 PL_reginput = locinput;
4834 if (ST.c1 == CHRTEST_UNINIT) {
4835 /* calculate c1 and c2 for possible match of 1st char
4836 * following curly */
4837 ST.c1 = ST.c2 = CHRTEST_VOID;
4838 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4839 regnode *text_node = ST.B;
4840 if (! HAS_TEXT(text_node))
4841 FIND_NEXT_IMPT(text_node);
4844 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4846 But the former is redundant in light of the latter.
4848 if this changes back then the macro for
4849 IS_TEXT and friends need to change.
4851 if (PL_regkind[OP(text_node)] == EXACT)
4854 ST.c1 = (U8)*STRING(text_node);
4856 (IS_TEXTF(text_node))
4858 : (IS_TEXTFL(text_node))
4859 ? PL_fold_locale[ST.c1]
4866 PerlIO_printf(Perl_debug_log,
4867 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4868 (int)(REPORT_CODE_OFF+(depth*2)),
4871 if (ST.c1 != CHRTEST_VOID
4872 && UCHARAT(PL_reginput) != ST.c1
4873 && UCHARAT(PL_reginput) != ST.c2)
4875 /* simulate B failing */
4877 PerlIO_printf(Perl_debug_log,
4878 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4879 (int)(REPORT_CODE_OFF+(depth*2)),"",
4882 state_num = CURLYM_B_fail;
4883 goto reenter_switch;
4887 /* mark current A as captured */
4888 I32 paren = ST.me->flags;
4890 PL_regoffs[paren].start
4891 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4892 PL_regoffs[paren].end = PL_reginput - PL_bostr;
4893 /*dmq: *PL_reglastcloseparen = paren; */
4896 PL_regoffs[paren].end = -1;
4897 if (cur_eval && cur_eval->u.eval.close_paren &&
4898 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4907 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4910 case CURLYM_B_fail: /* just failed to match a B */
4911 REGCP_UNWIND(ST.cp);
4913 I32 max = ARG2(ST.me);
4914 if (max != REG_INFTY && ST.count == max)
4916 goto curlym_do_A; /* try to match a further A */
4918 /* backtrack one A */
4919 if (ST.count == ARG1(ST.me) /* min */)
4922 locinput = HOPc(locinput, -ST.alen);
4923 goto curlym_do_B; /* try to match B */
4926 #define ST st->u.curly
4928 #define CURLY_SETPAREN(paren, success) \
4931 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4932 PL_regoffs[paren].end = locinput - PL_bostr; \
4933 *PL_reglastcloseparen = paren; \
4936 PL_regoffs[paren].end = -1; \
4939 case STAR: /* /A*B/ where A is width 1 */
4943 scan = NEXTOPER(scan);
4945 case PLUS: /* /A+B/ where A is width 1 */
4949 scan = NEXTOPER(scan);
4951 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4952 ST.paren = scan->flags; /* Which paren to set */
4953 if (ST.paren > PL_regsize)
4954 PL_regsize = ST.paren;
4955 if (ST.paren > *PL_reglastparen)
4956 *PL_reglastparen = ST.paren;
4957 ST.min = ARG1(scan); /* min to match */
4958 ST.max = ARG2(scan); /* max to match */
4959 if (cur_eval && cur_eval->u.eval.close_paren &&
4960 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4964 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4966 case CURLY: /* /A{m,n}B/ where A is width 1 */
4968 ST.min = ARG1(scan); /* min to match */
4969 ST.max = ARG2(scan); /* max to match */
4970 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4973 * Lookahead to avoid useless match attempts
4974 * when we know what character comes next.
4976 * Used to only do .*x and .*?x, but now it allows
4977 * for )'s, ('s and (?{ ... })'s to be in the way
4978 * of the quantifier and the EXACT-like node. -- japhy
4981 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4983 if (HAS_TEXT(next) || JUMPABLE(next)) {
4985 regnode *text_node = next;
4987 if (! HAS_TEXT(text_node))
4988 FIND_NEXT_IMPT(text_node);
4990 if (! HAS_TEXT(text_node))
4991 ST.c1 = ST.c2 = CHRTEST_VOID;
4993 if ( PL_regkind[OP(text_node)] != EXACT ) {
4994 ST.c1 = ST.c2 = CHRTEST_VOID;
4995 goto assume_ok_easy;
4998 s = (U8*)STRING(text_node);
5000 /* Currently we only get here when
5002 PL_rekind[OP(text_node)] == EXACT
5004 if this changes back then the macro for IS_TEXT and
5005 friends need to change. */
5008 if (IS_TEXTF(text_node))
5009 ST.c2 = PL_fold[ST.c1];
5010 else if (IS_TEXTFL(text_node))
5011 ST.c2 = PL_fold_locale[ST.c1];
5013 else { /* UTF_PATTERN */
5014 if (IS_TEXTF(text_node)) {
5015 STRLEN ulen1, ulen2;
5016 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5017 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5019 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5020 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5022 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5024 0 : UTF8_ALLOW_ANY);
5025 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5027 0 : UTF8_ALLOW_ANY);
5029 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5031 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5036 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5043 ST.c1 = ST.c2 = CHRTEST_VOID;
5048 PL_reginput = locinput;
5051 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5054 locinput = PL_reginput;
5056 if (ST.c1 == CHRTEST_VOID)
5057 goto curly_try_B_min;
5059 ST.oldloc = locinput;
5061 /* set ST.maxpos to the furthest point along the
5062 * string that could possibly match */
5063 if (ST.max == REG_INFTY) {
5064 ST.maxpos = PL_regeol - 1;
5066 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5069 else if (utf8_target) {
5070 int m = ST.max - ST.min;
5071 for (ST.maxpos = locinput;
5072 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5073 ST.maxpos += UTF8SKIP(ST.maxpos);
5076 ST.maxpos = locinput + ST.max - ST.min;
5077 if (ST.maxpos >= PL_regeol)
5078 ST.maxpos = PL_regeol - 1;
5080 goto curly_try_B_min_known;
5084 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5085 locinput = PL_reginput;
5086 if (ST.count < ST.min)
5088 if ((ST.count > ST.min)
5089 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5091 /* A{m,n} must come at the end of the string, there's
5092 * no point in backing off ... */
5094 /* ...except that $ and \Z can match before *and* after
5095 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5096 We may back off by one in this case. */
5097 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5101 goto curly_try_B_max;
5106 case CURLY_B_min_known_fail:
5107 /* failed to find B in a non-greedy match where c1,c2 valid */
5108 if (ST.paren && ST.count)
5109 PL_regoffs[ST.paren].end = -1;
5111 PL_reginput = locinput; /* Could be reset... */
5112 REGCP_UNWIND(ST.cp);
5113 /* Couldn't or didn't -- move forward. */
5114 ST.oldloc = locinput;
5116 locinput += UTF8SKIP(locinput);
5120 curly_try_B_min_known:
5121 /* find the next place where 'B' could work, then call B */
5125 n = (ST.oldloc == locinput) ? 0 : 1;
5126 if (ST.c1 == ST.c2) {
5128 /* set n to utf8_distance(oldloc, locinput) */
5129 while (locinput <= ST.maxpos &&
5130 utf8n_to_uvchr((U8*)locinput,
5131 UTF8_MAXBYTES, &len,
5132 uniflags) != (UV)ST.c1) {
5138 /* set n to utf8_distance(oldloc, locinput) */
5139 while (locinput <= ST.maxpos) {
5141 const UV c = utf8n_to_uvchr((U8*)locinput,
5142 UTF8_MAXBYTES, &len,
5144 if (c == (UV)ST.c1 || c == (UV)ST.c2)
5152 if (ST.c1 == ST.c2) {
5153 while (locinput <= ST.maxpos &&
5154 UCHARAT(locinput) != ST.c1)
5158 while (locinput <= ST.maxpos
5159 && UCHARAT(locinput) != ST.c1
5160 && UCHARAT(locinput) != ST.c2)
5163 n = locinput - ST.oldloc;
5165 if (locinput > ST.maxpos)
5167 /* PL_reginput == oldloc now */
5170 if (regrepeat(rex, ST.A, n, depth) < n)
5173 PL_reginput = locinput;
5174 CURLY_SETPAREN(ST.paren, ST.count);
5175 if (cur_eval && cur_eval->u.eval.close_paren &&
5176 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5179 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5184 case CURLY_B_min_fail:
5185 /* failed to find B in a non-greedy match where c1,c2 invalid */
5186 if (ST.paren && ST.count)
5187 PL_regoffs[ST.paren].end = -1;
5189 REGCP_UNWIND(ST.cp);
5190 /* failed -- move forward one */
5191 PL_reginput = locinput;
5192 if (regrepeat(rex, ST.A, 1, depth)) {
5194 locinput = PL_reginput;
5195 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5196 ST.count > 0)) /* count overflow ? */
5199 CURLY_SETPAREN(ST.paren, ST.count);
5200 if (cur_eval && cur_eval->u.eval.close_paren &&
5201 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5204 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5212 /* a successful greedy match: now try to match B */
5213 if (cur_eval && cur_eval->u.eval.close_paren &&
5214 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5219 if (ST.c1 != CHRTEST_VOID)
5220 c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5221 UTF8_MAXBYTES, 0, uniflags)
5222 : (UV) UCHARAT(PL_reginput);
5223 /* If it could work, try it. */
5224 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5225 CURLY_SETPAREN(ST.paren, ST.count);
5226 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5231 case CURLY_B_max_fail:
5232 /* failed to find B in a greedy match */
5233 if (ST.paren && ST.count)
5234 PL_regoffs[ST.paren].end = -1;
5236 REGCP_UNWIND(ST.cp);
5238 if (--ST.count < ST.min)
5240 PL_reginput = locinput = HOPc(locinput, -1);
5241 goto curly_try_B_max;
5248 /* we've just finished A in /(??{A})B/; now continue with B */
5250 st->u.eval.toggle_reg_flags
5251 = cur_eval->u.eval.toggle_reg_flags;
5252 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5254 st->u.eval.prev_rex = rex_sv; /* inner */
5255 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5256 rex = (struct regexp *)SvANY(rex_sv);
5257 rexi = RXi_GET(rex);
5258 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5259 ReREFCNT_inc(rex_sv);
5260 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
5262 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5263 PL_reglastparen = &rex->lastparen;
5264 PL_reglastcloseparen = &rex->lastcloseparen;
5266 REGCP_SET(st->u.eval.lastcp);
5267 PL_reginput = locinput;
5269 /* Restore parens of the outer rex without popping the
5271 tmpix = PL_savestack_ix;
5272 PL_savestack_ix = cur_eval->u.eval.lastcp;
5274 PL_savestack_ix = tmpix;
5276 st->u.eval.prev_eval = cur_eval;
5277 cur_eval = cur_eval->u.eval.prev_eval;
5279 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5280 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5281 if ( nochange_depth )
5284 PUSH_YES_STATE_GOTO(EVAL_AB,
5285 st->u.eval.prev_eval->u.eval.B); /* match B */
5288 if (locinput < reginfo->till) {
5289 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5290 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5292 (long)(locinput - PL_reg_starttry),
5293 (long)(reginfo->till - PL_reg_starttry),
5296 sayNO_SILENT; /* Cannot match: too short. */
5298 PL_reginput = locinput; /* put where regtry can find it */
5299 sayYES; /* Success! */
5301 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5303 PerlIO_printf(Perl_debug_log,
5304 "%*s %ssubpattern success...%s\n",
5305 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5306 PL_reginput = locinput; /* put where regtry can find it */
5307 sayYES; /* Success! */
5310 #define ST st->u.ifmatch
5312 case SUSPEND: /* (?>A) */
5314 PL_reginput = locinput;
5317 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5319 goto ifmatch_trivial_fail_test;
5321 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5323 ifmatch_trivial_fail_test:
5325 char * const s = HOPBACKc(locinput, scan->flags);
5330 sw = 1 - cBOOL(ST.wanted);
5334 next = scan + ARG(scan);
5342 PL_reginput = locinput;
5346 ST.logical = logical;
5347 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5349 /* execute body of (?...A) */
5350 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5353 case IFMATCH_A_fail: /* body of (?...A) failed */
5354 ST.wanted = !ST.wanted;
5357 case IFMATCH_A: /* body of (?...A) succeeded */
5359 sw = cBOOL(ST.wanted);
5361 else if (!ST.wanted)
5364 if (OP(ST.me) == SUSPEND)
5365 locinput = PL_reginput;
5367 locinput = PL_reginput = st->locinput;
5368 nextchr = UCHARAT(locinput);
5370 scan = ST.me + ARG(ST.me);
5373 continue; /* execute B */
5378 next = scan + ARG(scan);
5383 reginfo->cutpoint = PL_regeol;
5386 PL_reginput = locinput;
5388 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5389 PUSH_STATE_GOTO(COMMIT_next,next);
5391 case COMMIT_next_fail:
5398 #define ST st->u.mark
5400 ST.prev_mark = mark_state;
5401 ST.mark_name = sv_commit = sv_yes_mark
5402 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5404 ST.mark_loc = PL_reginput = locinput;
5405 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5407 case MARKPOINT_next:
5408 mark_state = ST.prev_mark;
5411 case MARKPOINT_next_fail:
5412 if (popmark && sv_eq(ST.mark_name,popmark))
5414 if (ST.mark_loc > startpoint)
5415 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5416 popmark = NULL; /* we found our mark */
5417 sv_commit = ST.mark_name;
5420 PerlIO_printf(Perl_debug_log,
5421 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5422 REPORT_CODE_OFF+depth*2, "",
5423 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5426 mark_state = ST.prev_mark;
5427 sv_yes_mark = mark_state ?
5428 mark_state->u.mark.mark_name : NULL;
5432 PL_reginput = locinput;
5434 /* (*SKIP) : if we fail we cut here*/
5435 ST.mark_name = NULL;
5436 ST.mark_loc = locinput;
5437 PUSH_STATE_GOTO(SKIP_next,next);
5439 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5440 otherwise do nothing. Meaning we need to scan
5442 regmatch_state *cur = mark_state;
5443 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5446 if ( sv_eq( cur->u.mark.mark_name,
5449 ST.mark_name = find;
5450 PUSH_STATE_GOTO( SKIP_next, next );
5452 cur = cur->u.mark.prev_mark;
5455 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5457 case SKIP_next_fail:
5459 /* (*CUT:NAME) - Set up to search for the name as we
5460 collapse the stack*/
5461 popmark = ST.mark_name;
5463 /* (*CUT) - No name, we cut here.*/
5464 if (ST.mark_loc > startpoint)
5465 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5466 /* but we set sv_commit to latest mark_name if there
5467 is one so they can test to see how things lead to this
5470 sv_commit=mark_state->u.mark.mark_name;
5478 if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5480 } else if ( 0xDF == n && !utf8_target && !UTF_PATTERN ) {
5483 U8 folded[UTF8_MAXBYTES_CASE+1];
5485 const char * const l = locinput;
5486 char *e = PL_regeol;
5487 to_uni_fold(n, folded, &foldlen);
5489 if (! foldEQ_utf8((const char*) folded, 0, foldlen, 1,
5490 l, &e, 0, utf8_target)) {
5495 nextchr = UCHARAT(locinput);
5498 if ((n=is_LNBREAK(locinput,utf8_target))) {
5500 nextchr = UCHARAT(locinput);
5505 #define CASE_CLASS(nAmE) \
5507 if ((n=is_##nAmE(locinput,utf8_target))) { \
5509 nextchr = UCHARAT(locinput); \
5514 if ((n=is_##nAmE(locinput,utf8_target))) { \
5517 locinput += UTF8SKIP(locinput); \
5518 nextchr = UCHARAT(locinput); \
5523 CASE_CLASS(HORIZWS);
5527 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5528 PTR2UV(scan), OP(scan));
5529 Perl_croak(aTHX_ "regexp memory corruption");
5533 /* switch break jumps here */
5534 scan = next; /* prepare to execute the next op and ... */
5535 continue; /* ... jump back to the top, reusing st */
5539 /* push a state that backtracks on success */
5540 st->u.yes.prev_yes_state = yes_state;
5544 /* push a new regex state, then continue at scan */
5546 regmatch_state *newst;
5549 regmatch_state *cur = st;
5550 regmatch_state *curyes = yes_state;
5552 regmatch_slab *slab = PL_regmatch_slab;
5553 for (;curd > -1;cur--,curd--) {
5554 if (cur < SLAB_FIRST(slab)) {
5556 cur = SLAB_LAST(slab);
5558 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5559 REPORT_CODE_OFF + 2 + depth * 2,"",
5560 curd, PL_reg_name[cur->resume_state],
5561 (curyes == cur) ? "yes" : ""
5564 curyes = cur->u.yes.prev_yes_state;
5567 DEBUG_STATE_pp("push")
5570 st->locinput = locinput;
5572 if (newst > SLAB_LAST(PL_regmatch_slab))
5573 newst = S_push_slab(aTHX);
5574 PL_regmatch_state = newst;
5576 locinput = PL_reginput;
5577 nextchr = UCHARAT(locinput);
5585 * We get here only if there's trouble -- normally "case END" is
5586 * the terminating point.
5588 Perl_croak(aTHX_ "corrupted regexp pointers");
5594 /* we have successfully completed a subexpression, but we must now
5595 * pop to the state marked by yes_state and continue from there */
5596 assert(st != yes_state);
5598 while (st != yes_state) {
5600 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5601 PL_regmatch_slab = PL_regmatch_slab->prev;
5602 st = SLAB_LAST(PL_regmatch_slab);
5606 DEBUG_STATE_pp("pop (no final)");
5608 DEBUG_STATE_pp("pop (yes)");
5614 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5615 || yes_state > SLAB_LAST(PL_regmatch_slab))
5617 /* not in this slab, pop slab */
5618 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5619 PL_regmatch_slab = PL_regmatch_slab->prev;
5620 st = SLAB_LAST(PL_regmatch_slab);
5622 depth -= (st - yes_state);
5625 yes_state = st->u.yes.prev_yes_state;
5626 PL_regmatch_state = st;
5629 locinput= st->locinput;
5630 nextchr = UCHARAT(locinput);
5632 state_num = st->resume_state + no_final;
5633 goto reenter_switch;
5636 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5637 PL_colors[4], PL_colors[5]));
5639 if (PL_reg_eval_set) {
5640 /* each successfully executed (?{...}) block does the equivalent of
5641 * local $^R = do {...}
5642 * When popping the save stack, all these locals would be undone;
5643 * bypass this by setting the outermost saved $^R to the latest
5645 if (oreplsv != GvSV(PL_replgv))
5646 sv_setsv(oreplsv, GvSV(PL_replgv));
5653 PerlIO_printf(Perl_debug_log,
5654 "%*s %sfailed...%s\n",
5655 REPORT_CODE_OFF+depth*2, "",
5656 PL_colors[4], PL_colors[5])
5668 /* there's a previous state to backtrack to */
5670 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5671 PL_regmatch_slab = PL_regmatch_slab->prev;
5672 st = SLAB_LAST(PL_regmatch_slab);
5674 PL_regmatch_state = st;
5675 locinput= st->locinput;
5676 nextchr = UCHARAT(locinput);
5678 DEBUG_STATE_pp("pop");
5680 if (yes_state == st)
5681 yes_state = st->u.yes.prev_yes_state;
5683 state_num = st->resume_state + 1; /* failure = success + 1 */
5684 goto reenter_switch;
5689 if (rex->intflags & PREGf_VERBARG_SEEN) {
5690 SV *sv_err = get_sv("REGERROR", 1);
5691 SV *sv_mrk = get_sv("REGMARK", 1);
5693 sv_commit = &PL_sv_no;
5695 sv_yes_mark = &PL_sv_yes;
5698 sv_commit = &PL_sv_yes;
5699 sv_yes_mark = &PL_sv_no;
5701 sv_setsv(sv_err, sv_commit);
5702 sv_setsv(sv_mrk, sv_yes_mark);
5705 /* clean up; in particular, free all slabs above current one */
5706 LEAVE_SCOPE(oldsave);
5712 - regrepeat - repeatedly match something simple, report how many
5715 * [This routine now assumes that it will only match on things of length 1.
5716 * That was true before, but now we assume scan - reginput is the count,
5717 * rather than incrementing count on every character. [Er, except utf8.]]
5720 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5723 register char *scan;
5725 register char *loceol = PL_regeol;
5726 register I32 hardcount = 0;
5727 register bool utf8_target = PL_reg_match_utf8;
5729 PERL_UNUSED_ARG(depth);
5732 PERL_ARGS_ASSERT_REGREPEAT;
5735 if (max == REG_INFTY)
5737 else if (max < loceol - scan)
5738 loceol = scan + max;
5743 while (scan < loceol && hardcount < max && *scan != '\n') {
5744 scan += UTF8SKIP(scan);
5748 while (scan < loceol && *scan != '\n')
5755 while (scan < loceol && hardcount < max) {
5756 scan += UTF8SKIP(scan);
5767 PL_reg_flags |= RF_tainted;
5771 /* To get here, EXACTish nodes must have *byte* length == 1. That means
5772 * they match only characters in the string that can be expressed as a
5773 * single byte. For non-utf8 strings, that means a simple match. For
5774 * utf8 strings, the character matched must be an invariant, or
5775 * downgradable to a single byte. The pattern's utf8ness is
5776 * irrelevant, as it must be a single byte, so either it isn't utf8, or
5777 * if it is it's an invariant */
5780 assert(! UTF_PATTERN || UNI_IS_INVARIANT(c));
5782 if ((! utf8_target) || UNI_IS_INVARIANT(c)) {
5784 /* Here, the string isn't utf8, or the character in the EXACT
5785 * node is the same in utf8 as not, so can just do equality.
5786 * Each matching char must be 1 byte long */
5789 while (scan < loceol && UCHARAT(scan) == c) {
5794 while (scan < loceol &&
5795 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5801 while (scan < loceol &&
5802 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5808 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
5813 /* Here, the string is utf8, and the pattern char is different
5814 * in utf8 than not. */
5819 /* Fastest to find the two utf8 bytes that represent c, and
5820 * then look for those in sequence in the utf8 string */
5821 U8 high = UTF8_TWO_BYTE_HI(c);
5822 U8 low = UTF8_TWO_BYTE_LO(c);
5825 while (hardcount < max
5826 && scan + 1 < loceol
5827 && UCHARAT(scan) == high
5828 && UCHARAT(scan + 1) == low)
5835 case EXACTFL: /* Doesn't really make sense, but is best we can
5836 do. The documents warn against mixing locale
5839 { /* utf8 string, so use utf8 foldEQ */
5840 char *tmpeol = loceol;
5841 while (hardcount < max
5842 && foldEQ_utf8(scan, &tmpeol, 0, utf8_target,
5843 STRING(p), NULL, 1, UTF_PATTERN))
5850 /* XXX Note that the above handles properly the German
5851 * sharp ss in the pattern matching ss in the string. But
5852 * it doesn't handle properly cases where the string
5853 * contains say 'LIGATURE ff' and the pattern is 'f+'.
5854 * This would require, say, a new function or revised
5855 * interface to foldEQ_utf8(), in which the maximum number
5856 * of characters to match could be passed and it would
5857 * return how many actually did. This is just one of many
5858 * cases where multi-char folds don't work properly, and so
5859 * the fix is being deferred */
5863 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
5870 while (hardcount < max && scan < loceol &&
5871 reginclass(prog, p, (U8*)scan, 0, utf8_target)) {
5872 scan += UTF8SKIP(scan);
5876 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5883 LOAD_UTF8_CHARCLASS_ALNUM();
5884 while (hardcount < max && scan < loceol &&
5885 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5887 scan += UTF8SKIP(scan);
5890 } else if (FLAGS(p) & USE_UNI) {
5891 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
5895 while (scan < loceol && isALNUM((U8) *scan)) {
5901 PL_reg_flags |= RF_tainted;
5904 while (hardcount < max && scan < loceol &&
5905 isALNUM_LC_utf8((U8*)scan)) {
5906 scan += UTF8SKIP(scan);
5910 while (scan < loceol && isALNUM_LC(*scan))
5917 LOAD_UTF8_CHARCLASS_ALNUM();
5918 while (hardcount < max && scan < loceol &&
5919 !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5921 scan += UTF8SKIP(scan);
5924 } else if (FLAGS(p) & USE_UNI) {
5925 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
5929 while (scan < loceol && ! isALNUM((U8) *scan)) {
5935 PL_reg_flags |= RF_tainted;
5938 while (hardcount < max && scan < loceol &&
5939 !isALNUM_LC_utf8((U8*)scan)) {
5940 scan += UTF8SKIP(scan);
5944 while (scan < loceol && !isALNUM_LC(*scan))
5951 LOAD_UTF8_CHARCLASS_SPACE();
5952 while (hardcount < max && scan < loceol &&
5954 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
5956 scan += UTF8SKIP(scan);
5959 } else if (FLAGS(p) & USE_UNI) {
5960 while (scan < loceol && isSPACE_L1((U8) *scan)) {
5964 while (scan < loceol && isSPACE((U8) *scan))
5969 PL_reg_flags |= RF_tainted;
5972 while (hardcount < max && scan < loceol &&
5973 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5974 scan += UTF8SKIP(scan);
5978 while (scan < loceol && isSPACE_LC(*scan))
5985 LOAD_UTF8_CHARCLASS_SPACE();
5986 while (hardcount < max && scan < loceol &&
5988 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
5990 scan += UTF8SKIP(scan);
5993 } else if (FLAGS(p) & USE_UNI) {
5994 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
5998 while (scan < loceol && ! isSPACE((U8) *scan)) {
6004 PL_reg_flags |= RF_tainted;
6007 while (hardcount < max && scan < loceol &&
6008 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
6009 scan += UTF8SKIP(scan);
6013 while (scan < loceol && !isSPACE_LC(*scan))
6020 LOAD_UTF8_CHARCLASS_DIGIT();
6021 while (hardcount < max && scan < loceol &&
6022 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6023 scan += UTF8SKIP(scan);
6027 while (scan < loceol && isDIGIT(*scan))
6034 LOAD_UTF8_CHARCLASS_DIGIT();
6035 while (hardcount < max && scan < loceol &&
6036 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
6037 scan += UTF8SKIP(scan);
6041 while (scan < loceol && !isDIGIT(*scan))
6047 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
6053 LNBREAK can match two latin chars, which is ok,
6054 because we have a null terminated string, but we
6055 have to use hardcount in this situation
6057 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
6066 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
6071 while (scan < loceol && is_HORIZWS_latin1(scan))
6078 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
6079 scan += UTF8SKIP(scan);
6083 while (scan < loceol && !is_HORIZWS_latin1(scan))
6091 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6096 while (scan < loceol && is_VERTWS_latin1(scan))
6104 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6105 scan += UTF8SKIP(scan);
6109 while (scan < loceol && !is_VERTWS_latin1(scan))
6115 default: /* Called on something of 0 width. */
6116 break; /* So match right here or not at all. */
6122 c = scan - PL_reginput;
6126 GET_RE_DEBUG_FLAGS_DECL;
6128 SV * const prop = sv_newmortal();
6129 regprop(prog, prop, p);
6130 PerlIO_printf(Perl_debug_log,
6131 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
6132 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6140 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6142 - regclass_swash - prepare the utf8 swash
6146 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6152 RXi_GET_DECL(prog,progi);
6153 const struct reg_data * const data = prog ? progi->data : NULL;
6155 PERL_ARGS_ASSERT_REGCLASS_SWASH;
6157 if (data && data->count) {
6158 const U32 n = ARG(node);
6160 if (data->what[n] == 's') {
6161 SV * const rv = MUTABLE_SV(data->data[n]);
6162 AV * const av = MUTABLE_AV(SvRV(rv));
6163 SV **const ary = AvARRAY(av);
6166 /* See the end of regcomp.c:S_regclass() for
6167 * documentation of these array elements. */
6170 a = SvROK(ary[1]) ? &ary[1] : NULL;
6171 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6175 else if (si && doinit) {
6176 sw = swash_init("utf8", "", si, 1, 0);
6177 (void)av_store(av, 1, sw);
6194 - reginclass - determine if a character falls into a character class
6196 n is the ANYOF regnode
6197 p is the target string
6198 lenp is pointer to the maximum number of bytes of how far to go in p
6199 (This is assumed wthout checking to always be at least the current
6201 utf8_target tells whether p is in UTF-8.
6203 Returns true if matched; false otherwise. If lenp is not NULL, on return
6204 from a successful match, the value it points to will be updated to how many
6205 bytes in p were matched. If there was no match, the value is undefined,
6206 possibly changed from the input.
6211 S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target)
6214 const char flags = ANYOF_FLAGS(n);
6220 PERL_ARGS_ASSERT_REGINCLASS;
6222 /* If c is not already the code point, get it */
6223 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6224 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len,
6225 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6226 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6227 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6228 * UTF8_ALLOW_FFFF */
6229 if (c_len == (STRLEN)-1)
6230 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6236 /* Use passed in max length, or one character if none passed in or less
6237 * than one character. And assume will match just one character. This is
6238 * overwritten later if matched more. */
6240 maxlen = (*lenp > c_len) ? *lenp : c_len;
6248 /* If this character is potentially in the bitmap, check it */
6250 if (ANYOF_BITMAP_TEST(n, c))
6252 else if (flags & ANYOF_FOLD) {
6255 if (flags & ANYOF_LOCALE) {
6256 PL_reg_flags |= RF_tainted;
6257 f = PL_fold_locale[c];
6261 if (f != c && ANYOF_BITMAP_TEST(n, f))
6265 if (!match && (flags & ANYOF_CLASS)) {
6266 PL_reg_flags |= RF_tainted;
6268 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
6269 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
6270 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
6271 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
6272 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
6273 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
6274 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
6275 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6276 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
6277 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
6278 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
6279 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
6280 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
6281 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
6282 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
6283 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
6284 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
6285 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
6286 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
6287 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
6288 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
6289 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
6290 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
6291 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
6292 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
6293 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
6294 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
6295 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
6296 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
6297 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
6298 ) /* How's that for a conditional? */
6305 /* If the bitmap didn't (or couldn't) match, and something outside the
6306 * bitmap could match, try that */
6307 if (!match && (utf8_target || (flags & ANYOF_UNICODE))) {
6308 if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
6313 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6321 utf8_p = bytes_to_utf8(p, &len);
6323 if (swash_fetch(sw, utf8_p, 1))
6325 else if (flags & ANYOF_FOLD) {
6326 if (!match && lenp && av) {
6328 for (i = 0; i <= av_len(av); i++) {
6329 SV* const sv = *av_fetch(av, i, FALSE);
6331 const char * const s = SvPV_const(sv, len);
6332 if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) {
6340 U8 folded[UTF8_MAXBYTES_CASE+1];
6342 /* See if the folded version matches */
6344 to_utf8_fold(utf8_p, folded, &foldlen);
6345 if (swash_fetch(sw, folded, 1)) { /* 1 => is utf8 */
6351 /* Consider "k" =~ /[K]/i. The line above would
6352 * have just folded the 'k' to itself, and that
6353 * isn't going to match 'K'. So we look through
6354 * the closure of everything that folds to 'k'.
6355 * That will find the 'K'. Initialize the list, if
6357 if (! PL_utf8_foldclosures) {
6359 /* If the folds haven't been read in, call a fold
6360 * function to force that */
6361 if (! PL_utf8_tofold) {
6362 U8 dummy[UTF8_MAXBYTES+1];
6364 to_utf8_fold((U8*) "A", dummy, &dummy_len);
6366 PL_utf8_foldclosures =
6367 _swash_inversion_hash(PL_utf8_tofold);
6370 /* The data structure is a hash with the keys every
6371 * character that is folded to, like 'k', and the
6372 * values each an array of everything that folds to
6373 * its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
6374 if ((listp = hv_fetch(PL_utf8_foldclosures,
6375 (char *) folded, foldlen, FALSE)))
6377 AV* list = (AV*) *listp;
6379 for (i = 0; i <= av_len(list); i++) {
6380 SV** try_p = av_fetch(list, i, FALSE);
6381 if (try_p == NULL) {
6382 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
6384 /* Don't have to worry about embeded nulls
6385 * since NULL isn't folded or foldable */
6386 if (swash_fetch(sw, (U8*) SvPVX(*try_p),1)) {
6396 /* If we allocated a string above, free it */
6397 if (! utf8_target) Safefree(utf8_p);
6402 return (flags & ANYOF_INVERT) ? !match : match;
6406 S_reghop3(U8 *s, I32 off, const U8* lim)
6410 PERL_ARGS_ASSERT_REGHOP3;
6413 while (off-- && s < lim) {
6414 /* XXX could check well-formedness here */
6419 while (off++ && s > lim) {
6421 if (UTF8_IS_CONTINUED(*s)) {
6422 while (s > lim && UTF8_IS_CONTINUATION(*s))
6425 /* XXX could check well-formedness here */
6432 /* there are a bunch of places where we use two reghop3's that should
6433 be replaced with this routine. but since thats not done yet
6434 we ifdef it out - dmq
6437 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6441 PERL_ARGS_ASSERT_REGHOP4;
6444 while (off-- && s < rlim) {
6445 /* XXX could check well-formedness here */
6450 while (off++ && s > llim) {
6452 if (UTF8_IS_CONTINUED(*s)) {
6453 while (s > llim && UTF8_IS_CONTINUATION(*s))
6456 /* XXX could check well-formedness here */
6464 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6468 PERL_ARGS_ASSERT_REGHOPMAYBE3;
6471 while (off-- && s < lim) {
6472 /* XXX could check well-formedness here */
6479 while (off++ && s > lim) {
6481 if (UTF8_IS_CONTINUED(*s)) {
6482 while (s > lim && UTF8_IS_CONTINUATION(*s))
6485 /* XXX could check well-formedness here */
6494 restore_pos(pTHX_ void *arg)
6497 regexp * const rex = (regexp *)arg;
6498 if (PL_reg_eval_set) {
6499 if (PL_reg_oldsaved) {
6500 rex->subbeg = PL_reg_oldsaved;
6501 rex->sublen = PL_reg_oldsavedlen;
6502 #ifdef PERL_OLD_COPY_ON_WRITE
6503 rex->saved_copy = PL_nrs;
6505 RXp_MATCH_COPIED_on(rex);
6507 PL_reg_magic->mg_len = PL_reg_oldpos;
6508 PL_reg_eval_set = 0;
6509 PL_curpm = PL_reg_oldcurpm;
6514 S_to_utf8_substr(pTHX_ register regexp *prog)
6518 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6521 if (prog->substrs->data[i].substr
6522 && !prog->substrs->data[i].utf8_substr) {
6523 SV* const sv = newSVsv(prog->substrs->data[i].substr);
6524 prog->substrs->data[i].utf8_substr = sv;
6525 sv_utf8_upgrade(sv);
6526 if (SvVALID(prog->substrs->data[i].substr)) {
6527 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6528 if (flags & FBMcf_TAIL) {
6529 /* Trim the trailing \n that fbm_compile added last
6531 SvCUR_set(sv, SvCUR(sv) - 1);
6532 /* Whilst this makes the SV technically "invalid" (as its
6533 buffer is no longer followed by "\0") when fbm_compile()
6534 adds the "\n" back, a "\0" is restored. */
6536 fbm_compile(sv, flags);
6538 if (prog->substrs->data[i].substr == prog->check_substr)
6539 prog->check_utf8 = sv;
6545 S_to_byte_substr(pTHX_ register regexp *prog)
6550 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6553 if (prog->substrs->data[i].utf8_substr
6554 && !prog->substrs->data[i].substr) {
6555 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6556 if (sv_utf8_downgrade(sv, TRUE)) {
6557 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6559 = BmFLAGS(prog->substrs->data[i].utf8_substr);
6560 if (flags & FBMcf_TAIL) {
6561 /* Trim the trailing \n that fbm_compile added last
6563 SvCUR_set(sv, SvCUR(sv) - 1);
6565 fbm_compile(sv, flags);
6571 prog->substrs->data[i].substr = sv;
6572 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6573 prog->check_substr = sv;
6580 * c-indentation-style: bsd
6582 * indent-tabs-mode: t
6585 * ex: set ts=8 sts=4 sw=4 noet: