5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
78 #ifdef PERL_IN_XSUB_RE
84 #define RF_tainted 1 /* tainted information used? */
85 #define RF_warned 2 /* warned about big count? */
87 #define RF_utf8 8 /* Pattern contains multibyte chars? */
89 #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0)
91 #define RS_init 1 /* eval environment created */
92 #define RS_set 2 /* replsv value is set */
98 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c)))
104 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
105 #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b)
107 #define HOPc(pos,off) \
108 (char *)(PL_reg_match_utf8 \
109 ? reghop3((U8*)pos, off, (U8*)(off >= 0 ? PL_regeol : PL_bostr)) \
111 #define HOPBACKc(pos, off) \
112 (char*)(PL_reg_match_utf8\
113 ? reghopmaybe3((U8*)pos, -off, (U8*)PL_bostr) \
114 : (pos - off >= PL_bostr) \
118 #define HOP3(pos,off,lim) (PL_reg_match_utf8 ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
119 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
121 /* these are unrolled below in the CCC_TRY_XXX defined */
122 #define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \
123 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END
125 /* Doesn't do an assert to verify that is correct */
126 #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \
127 if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END
129 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a")
130 #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0")
131 #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ")
133 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
134 LOAD_UTF8_CHARCLASS(X_begin, " "); \
135 LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \
136 /* These are utf8 constants, and not utf-ebcdic constants, so the \
137 * assert should likely and hopefully fail on an EBCDIC machine */ \
138 LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \
140 /* No asserts are done for these, in case called on an early \
141 * Unicode version in which they map to nothing */ \
142 LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \
143 LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \
144 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \
145 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \
146 LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\
147 LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \
148 LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */
151 We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
152 so that it is possible to override the option here without having to
153 rebuild the entire core. as we are required to do if we change regcomp.h
154 which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
156 #if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
157 #define BROKEN_UNICODE_CHARCLASS_MAPPINGS
160 #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
161 #define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM()
162 #define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE()
163 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT()
164 #define RE_utf8_perl_word PL_utf8_alnum
165 #define RE_utf8_perl_space PL_utf8_space
166 #define RE_utf8_posix_digit PL_utf8_digit
167 #define perl_word alnum
168 #define perl_space space
169 #define posix_digit digit
171 #define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a")
172 #define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ")
173 #define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0")
174 #define RE_utf8_perl_word PL_utf8_perl_word
175 #define RE_utf8_perl_space PL_utf8_perl_space
176 #define RE_utf8_posix_digit PL_utf8_posix_digit
180 #define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
182 PL_reg_flags |= RF_tainted; \
187 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
188 if (!CAT2(PL_utf8_,CLASS)) { \
192 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
196 if (!(OP(scan) == NAME \
197 ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \
198 : LCFUNC_utf8((U8*)locinput))) \
202 locinput += PL_utf8skip[nextchr]; \
203 nextchr = UCHARAT(locinput); \
206 /* Drops through to the macro that calls this one */
208 #define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
209 _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
210 if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
212 nextchr = UCHARAT(++locinput); \
215 /* Almost identical to the above, but has a case for a node that matches chars
216 * between 128 and 255 using Unicode (latin1) semantics. */
217 #define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \
218 _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
219 if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
221 nextchr = UCHARAT(++locinput); \
224 #define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
226 PL_reg_flags |= RF_tainted; \
229 if (!nextchr && locinput >= PL_regeol) \
231 if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \
232 if (!CAT2(PL_utf8_,CLASS)) { \
236 ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \
240 if ((OP(scan) == NAME \
241 ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \
242 : LCFUNC_utf8((U8*)locinput))) \
246 locinput += PL_utf8skip[nextchr]; \
247 nextchr = UCHARAT(locinput); \
251 #define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \
252 _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \
253 if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \
255 nextchr = UCHARAT(++locinput); \
259 #define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \
260 _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU) \
261 if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \
263 nextchr = UCHARAT(++locinput); \
268 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
270 /* for use after a quantifier and before an EXACT-like node -- japhy */
271 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
273 * NOTE that *nothing* that affects backtracking should be in here, specifically
274 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
275 * node that is in between two EXACT like nodes when ascertaining what the required
276 * "follow" character is. This should probably be moved to regex compile time
277 * although it may be done at run time beause of the REF possibility - more
278 * investigation required. -- demerphq
280 #define JUMPABLE(rn) ( \
282 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
284 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
285 OP(rn) == PLUS || OP(rn) == MINMOD || \
287 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
289 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
291 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
294 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
295 we don't need this definition. */
296 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
297 #define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
298 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
301 /* ... so we use this as its faster. */
302 #define IS_TEXT(rn) ( OP(rn)==EXACT )
303 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
304 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
309 Search for mandatory following text node; for lookahead, the text must
310 follow but for lookbehind (rn->flags != 0) we skip to the next step.
312 #define FIND_NEXT_IMPT(rn) STMT_START { \
313 while (JUMPABLE(rn)) { \
314 const OPCODE type = OP(rn); \
315 if (type == SUSPEND || PL_regkind[type] == CURLY) \
316 rn = NEXTOPER(NEXTOPER(rn)); \
317 else if (type == PLUS) \
319 else if (type == IFMATCH) \
320 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
321 else rn += NEXT_OFF(rn); \
326 static void restore_pos(pTHX_ void *arg);
328 #define REGCP_PAREN_ELEMS 4
329 #define REGCP_OTHER_ELEMS 5
330 #define REGCP_FRAME_ELEMS 1
331 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
332 * are needed for the regexp context stack bookkeeping. */
335 S_regcppush(pTHX_ I32 parenfloor)
338 const int retval = PL_savestack_ix;
339 const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS;
340 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
341 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
343 GET_RE_DEBUG_FLAGS_DECL;
345 if (paren_elems_to_push < 0)
346 Perl_croak(aTHX_ "panic: paren_elems_to_push < 0");
348 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
349 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
350 " out of range (%lu-%ld)",
351 total_elems, (unsigned long)PL_regsize, (long)parenfloor);
353 SSGROW(total_elems + REGCP_FRAME_ELEMS);
355 for (p = PL_regsize; p > parenfloor; p--) {
356 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
357 SSPUSHINT(PL_regoffs[p].end);
358 SSPUSHINT(PL_regoffs[p].start);
359 SSPUSHPTR(PL_reg_start_tmp[p]);
361 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
362 " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n",
363 (UV)p, (IV)PL_regoffs[p].start,
364 (IV)(PL_reg_start_tmp[p] - PL_bostr),
365 (IV)PL_regoffs[p].end
368 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
369 SSPUSHPTR(PL_regoffs);
370 SSPUSHINT(PL_regsize);
371 SSPUSHINT(*PL_reglastparen);
372 SSPUSHINT(*PL_reglastcloseparen);
373 SSPUSHPTR(PL_reginput);
374 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
379 /* These are needed since we do not localize EVAL nodes: */
380 #define REGCP_SET(cp) \
382 PerlIO_printf(Perl_debug_log, \
383 " Setting an EVAL scope, savestack=%"IVdf"\n", \
384 (IV)PL_savestack_ix)); \
387 #define REGCP_UNWIND(cp) \
389 if (cp != PL_savestack_ix) \
390 PerlIO_printf(Perl_debug_log, \
391 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
392 (IV)(cp), (IV)PL_savestack_ix)); \
396 S_regcppop(pTHX_ const regexp *rex)
401 GET_RE_DEBUG_FLAGS_DECL;
403 PERL_ARGS_ASSERT_REGCPPOP;
405 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
407 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
408 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
409 input = (char *) SSPOPPTR;
410 *PL_reglastcloseparen = SSPOPINT;
411 *PL_reglastparen = SSPOPINT;
412 PL_regsize = SSPOPINT;
413 PL_regoffs=(regexp_paren_pair *) SSPOPPTR;
415 i -= REGCP_OTHER_ELEMS;
416 /* Now restore the parentheses context. */
417 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
419 U32 paren = (U32)SSPOPINT;
420 PL_reg_start_tmp[paren] = (char *) SSPOPPTR;
421 PL_regoffs[paren].start = SSPOPINT;
423 if (paren <= *PL_reglastparen)
424 PL_regoffs[paren].end = tmps;
426 PerlIO_printf(Perl_debug_log,
427 " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n",
428 (UV)paren, (IV)PL_regoffs[paren].start,
429 (IV)(PL_reg_start_tmp[paren] - PL_bostr),
430 (IV)PL_regoffs[paren].end,
431 (paren > *PL_reglastparen ? "(no)" : ""));
435 if (*PL_reglastparen + 1 <= rex->nparens) {
436 PerlIO_printf(Perl_debug_log,
437 " restoring \\%"IVdf"..\\%"IVdf" to undef\n",
438 (IV)(*PL_reglastparen + 1), (IV)rex->nparens);
442 /* It would seem that the similar code in regtry()
443 * already takes care of this, and in fact it is in
444 * a better location to since this code can #if 0-ed out
445 * but the code in regtry() is needed or otherwise tests
446 * requiring null fields (pat.t#187 and split.t#{13,14}
447 * (as of patchlevel 7877) will fail. Then again,
448 * this code seems to be necessary or otherwise
449 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
450 * --jhi updated by dapm */
451 for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) {
453 PL_regoffs[i].start = -1;
454 PL_regoffs[i].end = -1;
460 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
463 * pregexec and friends
466 #ifndef PERL_IN_XSUB_RE
468 - pregexec - match a regexp against a string
471 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend,
472 char *strbeg, I32 minend, SV *screamer, U32 nosave)
473 /* strend: pointer to null at end of string */
474 /* strbeg: real beginning of string */
475 /* minend: end of match must be >=minend after stringarg. */
476 /* nosave: For optimizations. */
478 PERL_ARGS_ASSERT_PREGEXEC;
481 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
482 nosave ? 0 : REXEC_COPY_STR);
487 * Need to implement the following flags for reg_anch:
489 * USE_INTUIT_NOML - Useful to call re_intuit_start() first
491 * INTUIT_AUTORITATIVE_NOML - Can trust a positive answer
492 * INTUIT_AUTORITATIVE_ML
493 * INTUIT_ONCE_NOML - Intuit can match in one location only.
496 * Another flag for this function: SECOND_TIME (so that float substrs
497 * with giant delta may be not rechecked).
500 /* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */
502 /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend.
503 Otherwise, only SvCUR(sv) is used to get strbeg. */
505 /* XXXX We assume that strpos is strbeg unless sv. */
507 /* XXXX Some places assume that there is a fixed substring.
508 An update may be needed if optimizer marks as "INTUITable"
509 RExen without fixed substrings. Similarly, it is assumed that
510 lengths of all the strings are no more than minlen, thus they
511 cannot come from lookahead.
512 (Or minlen should take into account lookahead.)
513 NOTE: Some of this comment is not correct. minlen does now take account
514 of lookahead/behind. Further research is required. -- demerphq
518 /* A failure to find a constant substring means that there is no need to make
519 an expensive call to REx engine, thus we celebrate a failure. Similarly,
520 finding a substring too deep into the string means that less calls to
521 regtry() should be needed.
523 REx compiler's optimizer found 4 possible hints:
524 a) Anchored substring;
526 c) Whether we are anchored (beginning-of-line or \G);
527 d) First node (of those at offset 0) which may distingush positions;
528 We use a)b)d) and multiline-part of c), and try to find a position in the
529 string which does not contradict any of them.
532 /* Most of decisions we do here should have been done at compile time.
533 The nodes of the REx which we used for the search should have been
534 deleted from the finite automaton. */
537 Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos,
538 char *strend, const U32 flags, re_scream_pos_data *data)
541 struct regexp *const prog = (struct regexp *)SvANY(rx);
542 register I32 start_shift = 0;
543 /* Should be nonnegative! */
544 register I32 end_shift = 0;
549 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
551 register char *other_last = NULL; /* other substr checked before this */
552 char *check_at = NULL; /* check substr found at this pos */
553 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
554 RXi_GET_DECL(prog,progi);
556 const char * const i_strpos = strpos;
558 GET_RE_DEBUG_FLAGS_DECL;
560 PERL_ARGS_ASSERT_RE_INTUIT_START;
562 RX_MATCH_UTF8_set(rx,utf8_target);
565 PL_reg_flags |= RF_utf8;
568 debug_start_match(rx, utf8_target, strpos, strend,
569 sv ? "Guessing start of match in sv for"
570 : "Guessing start of match in string for");
573 /* CHR_DIST() would be more correct here but it makes things slow. */
574 if (prog->minlen > strend - strpos) {
575 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
576 "String too short... [re_intuit_start]\n"));
580 strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos;
583 if (!prog->check_utf8 && prog->check_substr)
584 to_utf8_substr(prog);
585 check = prog->check_utf8;
587 if (!prog->check_substr && prog->check_utf8)
588 to_byte_substr(prog);
589 check = prog->check_substr;
591 if (check == &PL_sv_undef) {
592 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
593 "Non-utf8 string cannot match utf8 check string\n"));
596 if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */
597 ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE)
598 || ( (prog->extflags & RXf_ANCH_BOL)
599 && !multiline ) ); /* Check after \n? */
602 if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */
603 && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */
604 /* SvCUR is not set on references: SvRV and SvPVX_const overlap */
606 && (strpos != strbeg)) {
607 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n"));
610 if (prog->check_offset_min == prog->check_offset_max &&
611 !(prog->extflags & RXf_CANY_SEEN)) {
612 /* Substring at constant offset from beg-of-str... */
615 s = HOP3c(strpos, prog->check_offset_min, strend);
618 slen = SvCUR(check); /* >= 1 */
620 if ( strend - s > slen || strend - s < slen - 1
621 || (strend - s == slen && strend[-1] != '\n')) {
622 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too long...\n"));
625 /* Now should match s[0..slen-2] */
627 if (slen && (*SvPVX_const(check) != *s
629 && memNE(SvPVX_const(check), s, slen)))) {
631 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String not equal...\n"));
635 else if (*SvPVX_const(check) != *s
636 || ((slen = SvCUR(check)) > 1
637 && memNE(SvPVX_const(check), s, slen)))
640 goto success_at_start;
643 /* Match is anchored, but substr is not anchored wrt beg-of-str. */
645 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
646 end_shift = prog->check_end_shift;
649 const I32 end = prog->check_offset_max + CHR_SVLEN(check)
650 - (SvTAIL(check) != 0);
651 const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end;
653 if (end_shift < eshift)
657 else { /* Can match at random position */
660 start_shift = prog->check_offset_min; /* okay to underestimate on CC */
661 end_shift = prog->check_end_shift;
663 /* end shift should be non negative here */
666 #ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */
668 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
669 (IV)end_shift, RX_PRECOMP(prog));
673 /* Find a possible match in the region s..strend by looking for
674 the "check" substring in the region corrected by start/end_shift. */
677 I32 srch_start_shift = start_shift;
678 I32 srch_end_shift = end_shift;
679 if (srch_start_shift < 0 && strbeg - s > srch_start_shift) {
680 srch_end_shift -= ((strbeg - s) - srch_start_shift);
681 srch_start_shift = strbeg - s;
683 DEBUG_OPTIMISE_MORE_r({
684 PerlIO_printf(Perl_debug_log, "Check offset min: %"IVdf" Start shift: %"IVdf" End shift %"IVdf" Real End Shift: %"IVdf"\n",
685 (IV)prog->check_offset_min,
686 (IV)srch_start_shift,
688 (IV)prog->check_end_shift);
691 if (flags & REXEC_SCREAM) {
692 I32 p = -1; /* Internal iterator of scream. */
693 I32 * const pp = data ? data->scream_pos : &p;
695 if (PL_screamfirst[BmRARE(check)] >= 0
696 || ( BmRARE(check) == '\n'
697 && (BmPREVIOUS(check) == SvCUR(check) - 1)
699 s = screaminstr(sv, check,
700 srch_start_shift + (s - strbeg), srch_end_shift, pp, 0);
703 /* we may be pointing at the wrong string */
704 if (s && RXp_MATCH_COPIED(prog))
705 s = strbeg + (s - SvPVX_const(sv));
707 *data->scream_olds = s;
712 if (prog->extflags & RXf_CANY_SEEN) {
713 start_point= (U8*)(s + srch_start_shift);
714 end_point= (U8*)(strend - srch_end_shift);
716 start_point= HOP3(s, srch_start_shift, srch_start_shift < 0 ? strbeg : strend);
717 end_point= HOP3(strend, -srch_end_shift, strbeg);
719 DEBUG_OPTIMISE_MORE_r({
720 PerlIO_printf(Perl_debug_log, "fbm_instr len=%d str=<%.*s>\n",
721 (int)(end_point - start_point),
722 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
726 s = fbm_instr( start_point, end_point,
727 check, multiline ? FBMrf_MULTILINE : 0);
730 /* Update the count-of-usability, remove useless subpatterns,
734 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
735 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
736 PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s",
737 (s ? "Found" : "Did not find"),
738 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
739 ? "anchored" : "floating"),
742 (s ? " at offset " : "...\n") );
747 /* Finish the diagnostic message */
748 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(s - i_strpos)) );
750 /* XXX dmq: first branch is for positive lookbehind...
751 Our check string is offset from the beginning of the pattern.
752 So we need to do any stclass tests offset forward from that
761 /* Got a candidate. Check MBOL anchoring, and the *other* substr.
762 Start with the other substr.
763 XXXX no SCREAM optimization yet - and a very coarse implementation
764 XXXX /ttx+/ results in anchored="ttx", floating="x". floating will
765 *always* match. Probably should be marked during compile...
766 Probably it is right to do no SCREAM here...
769 if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8)
770 : (prog->float_substr && prog->anchored_substr))
772 /* Take into account the "other" substring. */
773 /* XXXX May be hopelessly wrong for UTF... */
776 if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) {
779 char * const last = HOP3c(s, -start_shift, strbeg);
781 char * const saved_s = s;
784 t = s - prog->check_offset_max;
785 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
787 || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos))
792 t = HOP3c(t, prog->anchored_offset, strend);
793 if (t < other_last) /* These positions already checked */
795 last2 = last1 = HOP3c(strend, -prog->minlen, strbeg);
798 /* XXXX It is not documented what units *_offsets are in.
799 We assume bytes, but this is clearly wrong.
800 Meaning this code needs to be carefully reviewed for errors.
804 /* On end-of-str: see comment below. */
805 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
806 if (must == &PL_sv_undef) {
808 DEBUG_r(must = prog->anchored_utf8); /* for debug */
813 HOP3(HOP3(last1, prog->anchored_offset, strend)
814 + SvCUR(must), -(SvTAIL(must)!=0), strbeg),
816 multiline ? FBMrf_MULTILINE : 0
819 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
820 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
821 PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s",
822 (s ? "Found" : "Contradicts"),
823 quoted, RE_SV_TAIL(must));
828 if (last1 >= last2) {
829 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
830 ", giving up...\n"));
833 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
834 ", trying floating at offset %ld...\n",
835 (long)(HOP3c(saved_s, 1, strend) - i_strpos)));
836 other_last = HOP3c(last1, prog->anchored_offset+1, strend);
837 s = HOP3c(last, 1, strend);
841 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
842 (long)(s - i_strpos)));
843 t = HOP3c(s, -prog->anchored_offset, strbeg);
844 other_last = HOP3c(s, 1, strend);
852 else { /* Take into account the floating substring. */
854 char * const saved_s = s;
857 t = HOP3c(s, -start_shift, strbeg);
859 HOP3c(strend, -prog->minlen + prog->float_min_offset, strbeg);
860 if (CHR_DIST((U8*)last, (U8*)t) > prog->float_max_offset)
861 last = HOP3c(t, prog->float_max_offset, strend);
862 s = HOP3c(t, prog->float_min_offset, strend);
865 /* XXXX It is not documented what units *_offsets are in. Assume bytes. */
866 must = utf8_target ? prog->float_utf8 : prog->float_substr;
867 /* fbm_instr() takes into account exact value of end-of-str
868 if the check is SvTAIL(ed). Since false positives are OK,
869 and end-of-str is not later than strend we are OK. */
870 if (must == &PL_sv_undef) {
872 DEBUG_r(must = prog->float_utf8); /* for debug message */
875 s = fbm_instr((unsigned char*)s,
876 (unsigned char*)last + SvCUR(must)
878 must, multiline ? FBMrf_MULTILINE : 0);
880 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
881 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
882 PerlIO_printf(Perl_debug_log, "%s floating substr %s%s",
883 (s ? "Found" : "Contradicts"),
884 quoted, RE_SV_TAIL(must));
888 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
889 ", giving up...\n"));
892 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
893 ", trying anchored starting at offset %ld...\n",
894 (long)(saved_s + 1 - i_strpos)));
896 s = HOP3c(t, 1, strend);
900 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
901 (long)(s - i_strpos)));
902 other_last = s; /* Fix this later. --Hugo */
912 t= (char*)HOP3( s, -prog->check_offset_max, (prog->check_offset_max<0) ? strend : strpos);
914 DEBUG_OPTIMISE_MORE_r(
915 PerlIO_printf(Perl_debug_log,
916 "Check offset min:%"IVdf" max:%"IVdf" S:%"IVdf" t:%"IVdf" D:%"IVdf" end:%"IVdf"\n",
917 (IV)prog->check_offset_min,
918 (IV)prog->check_offset_max,
926 if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */
928 || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos)))
931 /* Fixed substring is found far enough so that the match
932 cannot start at strpos. */
934 if (ml_anch && t[-1] != '\n') {
935 /* Eventually fbm_*() should handle this, but often
936 anchored_offset is not 0, so this check will not be wasted. */
937 /* XXXX In the code below we prefer to look for "^" even in
938 presence of anchored substrings. And we search even
939 beyond the found float position. These pessimizations
940 are historical artefacts only. */
942 while (t < strend - prog->minlen) {
944 if (t < check_at - prog->check_offset_min) {
945 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
946 /* Since we moved from the found position,
947 we definitely contradict the found anchored
948 substr. Due to the above check we do not
949 contradict "check" substr.
950 Thus we can arrive here only if check substr
951 is float. Redo checking for "other"=="fixed".
954 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
955 PL_colors[0], PL_colors[1], (long)(strpos - i_strpos), (long)(strpos - i_strpos + prog->anchored_offset)));
956 goto do_other_anchored;
958 /* We don't contradict the found floating substring. */
959 /* XXXX Why not check for STCLASS? */
961 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m at offset %ld...\n",
962 PL_colors[0], PL_colors[1], (long)(s - i_strpos)));
965 /* Position contradicts check-string */
966 /* XXXX probably better to look for check-string
967 than for "\n", so one should lower the limit for t? */
968 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
969 PL_colors[0], PL_colors[1], (long)(t + 1 - i_strpos)));
970 other_last = strpos = s = t + 1;
975 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Did not find /%s^%s/m...\n",
976 PL_colors[0], PL_colors[1]));
980 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Starting position does not contradict /%s^%s/m...\n",
981 PL_colors[0], PL_colors[1]));
985 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
988 /* The found string does not prohibit matching at strpos,
989 - no optimization of calling REx engine can be performed,
990 unless it was an MBOL and we are not after MBOL,
991 or a future STCLASS check will fail this. */
993 /* Even in this situation we may use MBOL flag if strpos is offset
994 wrt the start of the string. */
995 if (ml_anch && sv && !SvROK(sv) /* See prev comment on SvROK */
996 && (strpos != strbeg) && strpos[-1] != '\n'
997 /* May be due to an implicit anchor of m{.*foo} */
998 && !(prog->intflags & PREGf_IMPLICIT))
1003 DEBUG_EXECUTE_r( if (ml_anch)
1004 PerlIO_printf(Perl_debug_log, "Position at offset %ld does not contradict /%s^%s/m...\n",
1005 (long)(strpos - i_strpos), PL_colors[0], PL_colors[1]);
1008 if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */
1010 prog->check_utf8 /* Could be deleted already */
1011 && --BmUSEFUL(prog->check_utf8) < 0
1012 && (prog->check_utf8 == prog->float_utf8)
1014 prog->check_substr /* Could be deleted already */
1015 && --BmUSEFUL(prog->check_substr) < 0
1016 && (prog->check_substr == prog->float_substr)
1019 /* If flags & SOMETHING - do not do it many times on the same match */
1020 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n"));
1021 /* XXX Does the destruction order has to change with utf8_target? */
1022 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1023 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1024 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1025 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1026 check = NULL; /* abort */
1028 /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevent flag
1029 see http://bugs.activestate.com/show_bug.cgi?id=87173 */
1030 if (prog->intflags & PREGf_IMPLICIT)
1031 prog->extflags &= ~RXf_ANCH_MBOL;
1032 /* XXXX This is a remnant of the old implementation. It
1033 looks wasteful, since now INTUIT can use many
1034 other heuristics. */
1035 prog->extflags &= ~RXf_USE_INTUIT;
1036 /* XXXX What other flags might need to be cleared in this branch? */
1042 /* Last resort... */
1043 /* XXXX BmUSEFUL already changed, maybe multiple change is meaningful... */
1044 /* trie stclasses are too expensive to use here, we are better off to
1045 leave it to regmatch itself */
1046 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1047 /* minlen == 0 is possible if regstclass is \b or \B,
1048 and the fixed substr is ''$.
1049 Since minlen is already taken into account, s+1 is before strend;
1050 accidentally, minlen >= 1 guaranties no false positives at s + 1
1051 even for \b or \B. But (minlen? 1 : 0) below assumes that
1052 regstclass does not come from lookahead... */
1053 /* If regstclass takes bytelength more than 1: If charlength==1, OK.
1054 This leaves EXACTF only, which is dealt with in find_byclass(). */
1055 const U8* const str = (U8*)STRING(progi->regstclass);
1056 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1057 ? CHR_DIST(str+STR_LEN(progi->regstclass), str)
1060 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1061 endpos= HOP3c(s, (prog->minlen ? cl_l : 0), strend);
1062 else if (prog->float_substr || prog->float_utf8)
1063 endpos= HOP3c(HOP3c(check_at, -start_shift, strbeg), cl_l, strend);
1067 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n",
1068 (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg)));
1071 s = find_byclass(prog, progi->regstclass, s, endpos, NULL);
1074 const char *what = NULL;
1076 if (endpos == strend) {
1077 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1078 "Could not match STCLASS...\n") );
1081 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1082 "This position contradicts STCLASS...\n") );
1083 if ((prog->extflags & RXf_ANCH) && !ml_anch)
1085 /* Contradict one of substrings */
1086 if (prog->anchored_substr || prog->anchored_utf8) {
1087 if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) {
1088 DEBUG_EXECUTE_r( what = "anchored" );
1090 s = HOP3c(t, 1, strend);
1091 if (s + start_shift + end_shift > strend) {
1092 /* XXXX Should be taken into account earlier? */
1093 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1094 "Could not match STCLASS...\n") );
1099 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1100 "Looking for %s substr starting at offset %ld...\n",
1101 what, (long)(s + start_shift - i_strpos)) );
1104 /* Have both, check_string is floating */
1105 if (t + start_shift >= check_at) /* Contradicts floating=check */
1106 goto retry_floating_check;
1107 /* Recheck anchored substring, but not floating... */
1111 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1112 "Looking for anchored substr starting at offset %ld...\n",
1113 (long)(other_last - i_strpos)) );
1114 goto do_other_anchored;
1116 /* Another way we could have checked stclass at the
1117 current position only: */
1122 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1123 "Looking for /%s^%s/m starting at offset %ld...\n",
1124 PL_colors[0], PL_colors[1], (long)(t - i_strpos)) );
1127 if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */
1129 /* Check is floating subtring. */
1130 retry_floating_check:
1131 t = check_at - start_shift;
1132 DEBUG_EXECUTE_r( what = "floating" );
1133 goto hop_and_restart;
1136 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1137 "By STCLASS: moving %ld --> %ld\n",
1138 (long)(t - i_strpos), (long)(s - i_strpos))
1142 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1143 "Does not contradict STCLASS...\n");
1148 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s%s:%s match at offset %ld\n",
1149 PL_colors[4], (check ? "Guessed" : "Giving up"),
1150 PL_colors[5], (long)(s - i_strpos)) );
1153 fail_finish: /* Substring not found */
1154 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1155 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1157 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1158 PL_colors[4], PL_colors[5]));
1162 #define DECL_TRIE_TYPE(scan) \
1163 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \
1164 trie_type = (scan->flags != EXACT) \
1165 ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \
1166 : (utf8_target ? trie_utf8 : trie_plain)
1168 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \
1169 uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \
1170 switch (trie_type) { \
1171 case trie_utf8_fold: \
1172 if ( foldlen>0 ) { \
1173 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1178 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1179 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1180 foldlen -= UNISKIP( uvc ); \
1181 uscan = foldbuf + UNISKIP( uvc ); \
1184 case trie_latin_utf8_fold: \
1185 if ( foldlen>0 ) { \
1186 uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
1192 uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \
1193 foldlen -= UNISKIP( uvc ); \
1194 uscan = foldbuf + UNISKIP( uvc ); \
1198 uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \
1205 charid = trie->charmap[ uvc ]; \
1209 if (widecharmap) { \
1210 SV** const svpp = hv_fetch(widecharmap, \
1211 (char*)&uvc, sizeof(UV), 0); \
1213 charid = (U16)SvIV(*svpp); \
1218 #define REXEC_FBC_EXACTISH_CHECK(CoNd) \
1220 char *my_strend= (char *)strend; \
1223 foldEQ_utf8(s, &my_strend, 0, utf8_target, \
1224 m, NULL, ln, cBOOL(UTF_PATTERN))) \
1225 && (!reginfo || regtry(reginfo, &s)) ) \
1228 U8 foldbuf[UTF8_MAXBYTES_CASE+1]; \
1229 uvchr_to_utf8(tmpbuf, c); \
1230 f = to_utf8_fold(tmpbuf, foldbuf, &foldlen); \
1232 && (f == c1 || f == c2) \
1234 foldEQ_utf8(s, &my_strend, 0, utf8_target,\
1235 m, NULL, ln, cBOOL(UTF_PATTERN)))\
1236 && (!reginfo || regtry(reginfo, &s)) ) \
1242 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1246 && (ln == 1 || (OP(c) == EXACTF \
1247 ? foldEQ(s, m, ln) \
1248 : foldEQ_locale(s, m, ln))) \
1249 && (!reginfo || regtry(reginfo, &s)) ) \
1255 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1257 while (s + (uskip = UTF8SKIP(s)) <= strend) { \
1263 #define REXEC_FBC_SCAN(CoDe) \
1265 while (s < strend) { \
1271 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1272 REXEC_FBC_UTF8_SCAN( \
1274 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1283 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1286 if (tmp && (!reginfo || regtry(reginfo, &s))) \
1295 #define REXEC_FBC_TRYIT \
1296 if ((!reginfo || regtry(reginfo, &s))) \
1299 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1300 if (utf8_target) { \
1301 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1304 REXEC_FBC_CLASS_SCAN(CoNd); \
1308 #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \
1309 if (utf8_target) { \
1311 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1314 REXEC_FBC_CLASS_SCAN(CoNd); \
1318 #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \
1319 PL_reg_flags |= RF_tainted; \
1320 if (utf8_target) { \
1321 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1324 REXEC_FBC_CLASS_SCAN(CoNd); \
1328 #define DUMP_EXEC_POS(li,s,doutf8) \
1329 dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8)
1331 /* We know what class REx starts with. Try to find this position... */
1332 /* if reginfo is NULL, its a dryrun */
1333 /* annoyingly all the vars in this routine have different names from their counterparts
1334 in regmatch. /grrr */
1337 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1338 const char *strend, regmatch_info *reginfo)
1341 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1345 register STRLEN uskip;
1349 register I32 tmp = 1; /* Scratch variable? */
1350 register const bool utf8_target = PL_reg_match_utf8;
1351 RXi_GET_DECL(prog,progi);
1353 PERL_ARGS_ASSERT_FIND_BYCLASS;
1355 /* We know what class it must start with. */
1359 REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) ||
1360 !UTF8_IS_INVARIANT((U8)s[0]) ?
1361 reginclass(prog, c, (U8*)s, 0, utf8_target) :
1362 REGINCLASS(prog, c, (U8*)s));
1365 while (s < strend) {
1368 if (REGINCLASS(prog, c, (U8*)s) ||
1369 (ANYOF_FOLD_SHARP_S(c, s, strend) &&
1370 /* The assignment of 2 is intentional:
1371 * for the folded sharp s, the skip is 2. */
1372 (skip = SHARP_S_SKIP))) {
1373 if (tmp && (!reginfo || regtry(reginfo, &s)))
1386 if (tmp && (!reginfo || regtry(reginfo, &s)))
1394 ln = STR_LEN(c); /* length to match in octets/bytes */
1395 lnc = (I32) ln; /* length to match in characters */
1397 STRLEN ulen1, ulen2;
1399 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
1400 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
1401 /* used by commented-out code below */
1402 /*const U32 uniflags = UTF8_ALLOW_DEFAULT;*/
1404 /* XXX: Since the node will be case folded at compile
1405 time this logic is a little odd, although im not
1406 sure that its actually wrong. --dmq */
1408 c1 = to_utf8_lower((U8*)m, tmpbuf1, &ulen1);
1409 c2 = to_utf8_upper((U8*)m, tmpbuf2, &ulen2);
1411 /* XXX: This is kinda strange. to_utf8_XYZ returns the
1412 codepoint of the first character in the converted
1413 form, yet originally we did the extra step.
1414 No tests fail by commenting this code out however
1415 so Ive left it out. -- dmq.
1417 c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXBYTES_CASE,
1419 c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXBYTES_CASE,
1424 while (sm < ((U8 *) m + ln)) {
1439 c2 = PL_fold_locale[c1];
1441 e = HOP3c(strend, -((I32)lnc), s);
1443 if (!reginfo && e < s)
1444 e = s; /* Due to minlen logic of intuit() */
1446 /* The idea in the EXACTF* cases is to first find the
1447 * first character of the EXACTF* node and then, if
1448 * necessary, case-insensitively compare the full
1449 * text of the node. The c1 and c2 are the first
1450 * characters (though in Unicode it gets a bit
1451 * more complicated because there are more cases
1452 * than just upper and lower: one needs to use
1453 * the so-called folding case for case-insensitive
1454 * matching (called "loose matching" in Unicode).
1455 * foldEQ_utf8() will do just that. */
1457 if (utf8_target || UTF_PATTERN) {
1459 U8 tmpbuf [UTF8_MAXBYTES+1];
1462 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1464 /* Upper and lower of 1st char are equal -
1465 * probably not a "letter". */
1468 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1473 REXEC_FBC_EXACTISH_CHECK(c == c1);
1479 c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len,
1485 /* Handle some of the three Greek sigmas cases.
1486 * Note that not all the possible combinations
1487 * are handled here: some of them are handled
1488 * by the standard folding rules, and some of
1489 * them (the character class or ANYOF cases)
1490 * are handled during compiletime in
1491 * regexec.c:S_regclass(). */
1492 if (c == (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA ||
1493 c == (UV)UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA)
1494 c = (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA;
1496 REXEC_FBC_EXACTISH_CHECK(c == c1 || c == c2);
1501 /* Neither pattern nor string are UTF8 */
1503 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1505 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1509 PL_reg_flags |= RF_tainted;
1516 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1517 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1519 tmp = ((OP(c) == BOUND ?
1520 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1521 LOAD_UTF8_CHARCLASS_ALNUM();
1522 REXEC_FBC_UTF8_SCAN(
1523 if (tmp == !(OP(c) == BOUND ?
1524 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1525 isALNUM_LC_utf8((U8*)s)))
1532 else { /* Not utf8 */
1533 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1534 tmp = cBOOL((OP(c) == BOUNDL)
1536 : (isWORDCHAR_L1(tmp)
1537 && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1542 : (isWORDCHAR_L1((U8) *s)
1543 && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1550 if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s)))
1554 PL_reg_flags |= RF_tainted;
1561 U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr);
1562 tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT);
1564 tmp = ((OP(c) == NBOUND ?
1565 isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0);
1566 LOAD_UTF8_CHARCLASS_ALNUM();
1567 REXEC_FBC_UTF8_SCAN(
1568 if (tmp == !(OP(c) == NBOUND ?
1569 cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) :
1570 isALNUM_LC_utf8((U8*)s)))
1572 else REXEC_FBC_TRYIT;
1576 tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n';
1577 tmp = cBOOL((OP(c) == NBOUNDL)
1579 : (isWORDCHAR_L1(tmp)
1580 && (isASCII(tmp) || (FLAGS(c) & USE_UNI))));
1585 : (isWORDCHAR_L1((U8) *s)
1586 && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))))
1590 else REXEC_FBC_TRYIT;
1593 if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s)))
1597 REXEC_FBC_CSCAN_PRELOAD(
1598 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1599 swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1600 (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)
1603 REXEC_FBC_CSCAN_TAINT(
1604 isALNUM_LC_utf8((U8*)s),
1608 REXEC_FBC_CSCAN_PRELOAD(
1609 LOAD_UTF8_CHARCLASS_PERL_WORD(),
1610 !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target),
1611 ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s))
1614 REXEC_FBC_CSCAN_TAINT(
1615 !isALNUM_LC_utf8((U8*)s),
1619 REXEC_FBC_CSCAN_PRELOAD(
1620 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1621 *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target),
1622 isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))
1625 REXEC_FBC_CSCAN_TAINT(
1626 *s == ' ' || isSPACE_LC_utf8((U8*)s),
1630 REXEC_FBC_CSCAN_PRELOAD(
1631 LOAD_UTF8_CHARCLASS_PERL_SPACE(),
1632 !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)),
1633 !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)))
1636 REXEC_FBC_CSCAN_TAINT(
1637 !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
1641 REXEC_FBC_CSCAN_PRELOAD(
1642 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1643 swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1647 REXEC_FBC_CSCAN_TAINT(
1648 isDIGIT_LC_utf8((U8*)s),
1652 REXEC_FBC_CSCAN_PRELOAD(
1653 LOAD_UTF8_CHARCLASS_POSIX_DIGIT(),
1654 !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target),
1658 REXEC_FBC_CSCAN_TAINT(
1659 !isDIGIT_LC_utf8((U8*)s),
1665 is_LNBREAK_latin1(s)
1675 !is_VERTWS_latin1(s)
1680 is_HORIZWS_latin1(s)
1684 !is_HORIZWS_utf8(s),
1685 !is_HORIZWS_latin1(s)
1691 /* what trie are we using right now */
1693 = (reg_ac_data*)progi->data->data[ ARG( c ) ];
1695 = (reg_trie_data*)progi->data->data[ aho->trie ];
1696 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
1698 const char *last_start = strend - trie->minlen;
1700 const char *real_start = s;
1702 STRLEN maxlen = trie->maxlen;
1704 U8 **points; /* map of where we were in the input string
1705 when reading a given char. For ASCII this
1706 is unnecessary overhead as the relationship
1707 is always 1:1, but for Unicode, especially
1708 case folded Unicode this is not true. */
1709 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1713 GET_RE_DEBUG_FLAGS_DECL;
1715 /* We can't just allocate points here. We need to wrap it in
1716 * an SV so it gets freed properly if there is a croak while
1717 * running the match */
1720 sv_points=newSV(maxlen * sizeof(U8 *));
1721 SvCUR_set(sv_points,
1722 maxlen * sizeof(U8 *));
1723 SvPOK_on(sv_points);
1724 sv_2mortal(sv_points);
1725 points=(U8**)SvPV_nolen(sv_points );
1726 if ( trie_type != trie_utf8_fold
1727 && (trie->bitmap || OP(c)==AHOCORASICKC) )
1730 bitmap=(U8*)trie->bitmap;
1732 bitmap=(U8*)ANYOF_BITMAP(c);
1734 /* this is the Aho-Corasick algorithm modified a touch
1735 to include special handling for long "unknown char"
1736 sequences. The basic idea being that we use AC as long
1737 as we are dealing with a possible matching char, when
1738 we encounter an unknown char (and we have not encountered
1739 an accepting state) we scan forward until we find a legal
1741 AC matching is basically that of trie matching, except
1742 that when we encounter a failing transition, we fall back
1743 to the current states "fail state", and try the current char
1744 again, a process we repeat until we reach the root state,
1745 state 1, or a legal transition. If we fail on the root state
1746 then we can either terminate if we have reached an accepting
1747 state previously, or restart the entire process from the beginning
1751 while (s <= last_start) {
1752 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1760 U8 *uscan = (U8*)NULL;
1761 U8 *leftmost = NULL;
1763 U32 accepted_word= 0;
1767 while ( state && uc <= (U8*)strend ) {
1769 U32 word = aho->states[ state ].wordnum;
1773 DEBUG_TRIE_EXECUTE_r(
1774 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1775 dump_exec_pos( (char *)uc, c, strend, real_start,
1776 (char *)uc, utf8_target );
1777 PerlIO_printf( Perl_debug_log,
1778 " Scanning for legal start char...\n");
1781 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
1786 if (uc >(U8*)last_start) break;
1790 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
1791 if (!leftmost || lpos < leftmost) {
1792 DEBUG_r(accepted_word=word);
1798 points[pointpos++ % maxlen]= uc;
1799 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
1800 uscan, len, uvc, charid, foldlen,
1802 DEBUG_TRIE_EXECUTE_r({
1803 dump_exec_pos( (char *)uc, c, strend, real_start,
1805 PerlIO_printf(Perl_debug_log,
1806 " Charid:%3u CP:%4"UVxf" ",
1812 word = aho->states[ state ].wordnum;
1814 base = aho->states[ state ].trans.base;
1816 DEBUG_TRIE_EXECUTE_r({
1818 dump_exec_pos( (char *)uc, c, strend, real_start,
1820 PerlIO_printf( Perl_debug_log,
1821 "%sState: %4"UVxf", word=%"UVxf,
1822 failed ? " Fail transition to " : "",
1823 (UV)state, (UV)word);
1829 ( ((offset = base + charid
1830 - 1 - trie->uniquecharcount)) >= 0)
1831 && ((U32)offset < trie->lasttrans)
1832 && trie->trans[offset].check == state
1833 && (tmp=trie->trans[offset].next))
1835 DEBUG_TRIE_EXECUTE_r(
1836 PerlIO_printf( Perl_debug_log," - legal\n"));
1841 DEBUG_TRIE_EXECUTE_r(
1842 PerlIO_printf( Perl_debug_log," - fail\n"));
1844 state = aho->fail[state];
1848 /* we must be accepting here */
1849 DEBUG_TRIE_EXECUTE_r(
1850 PerlIO_printf( Perl_debug_log," - accepting\n"));
1859 if (!state) state = 1;
1862 if ( aho->states[ state ].wordnum ) {
1863 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
1864 if (!leftmost || lpos < leftmost) {
1865 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
1870 s = (char*)leftmost;
1871 DEBUG_TRIE_EXECUTE_r({
1873 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
1874 (UV)accepted_word, (IV)(s - real_start)
1877 if (!reginfo || regtry(reginfo, &s)) {
1883 DEBUG_TRIE_EXECUTE_r({
1884 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
1887 DEBUG_TRIE_EXECUTE_r(
1888 PerlIO_printf( Perl_debug_log,"No match.\n"));
1897 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
1907 - regexec_flags - match a regexp against a string
1910 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *strend,
1911 char *strbeg, I32 minend, SV *sv, void *data, U32 flags)
1912 /* strend: pointer to null at end of string */
1913 /* strbeg: real beginning of string */
1914 /* minend: end of match must be >=minend after stringarg. */
1915 /* data: May be used for some additional optimizations.
1916 Currently its only used, with a U32 cast, for transmitting
1917 the ganch offset when doing a /g match. This will change */
1918 /* nosave: For optimizations. */
1921 struct regexp *const prog = (struct regexp *)SvANY(rx);
1922 /*register*/ char *s;
1923 register regnode *c;
1924 /*register*/ char *startpos = stringarg;
1925 I32 minlen; /* must match at least this many chars */
1926 I32 dontbother = 0; /* how many characters not to try at end */
1927 I32 end_shift = 0; /* Same for the end. */ /* CC */
1928 I32 scream_pos = -1; /* Internal iterator of scream. */
1929 char *scream_olds = NULL;
1930 const bool utf8_target = cBOOL(DO_UTF8(sv));
1932 RXi_GET_DECL(prog,progi);
1933 regmatch_info reginfo; /* create some info to pass to regtry etc */
1934 regexp_paren_pair *swap = NULL;
1935 GET_RE_DEBUG_FLAGS_DECL;
1937 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
1938 PERL_UNUSED_ARG(data);
1940 /* Be paranoid... */
1941 if (prog == NULL || startpos == NULL) {
1942 Perl_croak(aTHX_ "NULL regexp parameter");
1946 multiline = prog->extflags & RXf_PMf_MULTILINE;
1947 reginfo.prog = rx; /* Yes, sorry that this is confusing. */
1949 RX_MATCH_UTF8_set(rx, utf8_target);
1951 debug_start_match(rx, utf8_target, startpos, strend,
1955 minlen = prog->minlen;
1957 if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
1958 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1959 "String too short [regexec_flags]...\n"));
1964 /* Check validity of program. */
1965 if (UCHARAT(progi->program) != REG_MAGIC) {
1966 Perl_croak(aTHX_ "corrupted regexp program");
1970 PL_reg_eval_set = 0;
1974 PL_reg_flags |= RF_utf8;
1976 /* Mark beginning of line for ^ and lookbehind. */
1977 reginfo.bol = startpos; /* XXX not used ??? */
1981 /* Mark end of line for $ (and such) */
1984 /* see how far we have to get to not match where we matched before */
1985 reginfo.till = startpos+minend;
1987 /* If there is a "must appear" string, look for it. */
1990 if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */
1992 if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */
1993 reginfo.ganch = startpos + prog->gofs;
1994 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
1995 "GPOS IGNOREPOS: reginfo.ganch = startpos + %"UVxf"\n",(UV)prog->gofs));
1996 } else if (sv && SvTYPE(sv) >= SVt_PVMG
1998 && (mg = mg_find(sv, PERL_MAGIC_regex_global))
1999 && mg->mg_len >= 0) {
2000 reginfo.ganch = strbeg + mg->mg_len; /* Defined pos() */
2001 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2002 "GPOS MAGIC: reginfo.ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len));
2004 if (prog->extflags & RXf_ANCH_GPOS) {
2005 if (s > reginfo.ganch)
2007 s = reginfo.ganch - prog->gofs;
2008 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2009 "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs));
2015 reginfo.ganch = strbeg + PTR2UV(data);
2016 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2017 "GPOS DATA: reginfo.ganch= strbeg + %"UVxf"\n",PTR2UV(data)));
2019 } else { /* pos() not defined */
2020 reginfo.ganch = strbeg;
2021 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2022 "GPOS: reginfo.ganch = strbeg\n"));
2025 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2026 /* We have to be careful. If the previous successful match
2027 was from this regex we don't want a subsequent partially
2028 successful match to clobber the old results.
2029 So when we detect this possibility we add a swap buffer
2030 to the re, and switch the buffer each match. If we fail
2031 we switch it back, otherwise we leave it swapped.
2034 /* do we need a save destructor here for eval dies? */
2035 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2037 if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) {
2038 re_scream_pos_data d;
2040 d.scream_olds = &scream_olds;
2041 d.scream_pos = &scream_pos;
2042 s = re_intuit_start(rx, sv, s, strend, flags, &d);
2044 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n"));
2045 goto phooey; /* not present */
2051 /* Simplest case: anchored match need be tried only once. */
2052 /* [unless only anchor is BOL and multiline is set] */
2053 if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) {
2054 if (s == startpos && regtry(®info, &startpos))
2056 else if (multiline || (prog->intflags & PREGf_IMPLICIT)
2057 || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */
2062 dontbother = minlen - 1;
2063 end = HOP3c(strend, -dontbother, strbeg) - 1;
2064 /* for multiline we only have to try after newlines */
2065 if (prog->check_substr || prog->check_utf8) {
2066 /* because of the goto we can not easily reuse the macros for bifurcating the
2067 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2070 goto after_try_utf8;
2072 if (regtry(®info, &s)) {
2079 if (prog->extflags & RXf_USE_INTUIT) {
2080 s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL);
2089 } /* end search for check string in unicode */
2091 if (s == startpos) {
2092 goto after_try_latin;
2095 if (regtry(®info, &s)) {
2102 if (prog->extflags & RXf_USE_INTUIT) {
2103 s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL);
2112 } /* end search for check string in latin*/
2113 } /* end search for check string */
2114 else { /* search for newline */
2116 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2119 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2121 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2122 if (regtry(®info, &s))
2126 } /* end search for newline */
2127 } /* end anchored/multiline check string search */
2129 } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK))
2131 /* the warning about reginfo.ganch being used without intialization
2132 is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN
2133 and we only enter this block when the same bit is set. */
2134 char *tmp_s = reginfo.ganch - prog->gofs;
2136 if (tmp_s >= strbeg && regtry(®info, &tmp_s))
2141 /* Messy cases: unanchored match. */
2142 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2143 /* we have /x+whatever/ */
2144 /* it must be a one character string (XXXX Except UTF_PATTERN?) */
2149 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2150 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2151 ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0];
2156 DEBUG_EXECUTE_r( did_match = 1 );
2157 if (regtry(®info, &s)) goto got_it;
2159 while (s < strend && *s == ch)
2167 DEBUG_EXECUTE_r( did_match = 1 );
2168 if (regtry(®info, &s)) goto got_it;
2170 while (s < strend && *s == ch)
2175 DEBUG_EXECUTE_r(if (!did_match)
2176 PerlIO_printf(Perl_debug_log,
2177 "Did not find anchored character...\n")
2180 else if (prog->anchored_substr != NULL
2181 || prog->anchored_utf8 != NULL
2182 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2183 && prog->float_max_offset < strend - s)) {
2188 char *last1; /* Last position checked before */
2192 if (prog->anchored_substr || prog->anchored_utf8) {
2193 if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr))
2194 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2195 must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr;
2196 back_max = back_min = prog->anchored_offset;
2198 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2199 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2200 must = utf8_target ? prog->float_utf8 : prog->float_substr;
2201 back_max = prog->float_max_offset;
2202 back_min = prog->float_min_offset;
2206 if (must == &PL_sv_undef)
2207 /* could not downgrade utf8 check substring, so must fail */
2213 last = HOP3c(strend, /* Cannot start after this */
2214 -(I32)(CHR_SVLEN(must)
2215 - (SvTAIL(must) != 0) + back_min), strbeg);
2218 last1 = HOPc(s, -1);
2220 last1 = s - 1; /* bogus */
2222 /* XXXX check_substr already used to find "s", can optimize if
2223 check_substr==must. */
2225 dontbother = end_shift;
2226 strend = HOPc(strend, -dontbother);
2227 while ( (s <= last) &&
2228 ((flags & REXEC_SCREAM)
2229 ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg,
2230 end_shift, &scream_pos, 0))
2231 : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)),
2232 (unsigned char*)strend, must,
2233 multiline ? FBMrf_MULTILINE : 0))) ) {
2234 /* we may be pointing at the wrong string */
2235 if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog))
2236 s = strbeg + (s - SvPVX_const(sv));
2237 DEBUG_EXECUTE_r( did_match = 1 );
2238 if (HOPc(s, -back_max) > last1) {
2239 last1 = HOPc(s, -back_min);
2240 s = HOPc(s, -back_max);
2243 char * const t = (last1 >= PL_bostr) ? HOPc(last1, 1) : last1 + 1;
2245 last1 = HOPc(s, -back_min);
2249 while (s <= last1) {
2250 if (regtry(®info, &s))
2256 while (s <= last1) {
2257 if (regtry(®info, &s))
2263 DEBUG_EXECUTE_r(if (!did_match) {
2264 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2265 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2266 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2267 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2268 ? "anchored" : "floating"),
2269 quoted, RE_SV_TAIL(must));
2273 else if ( (c = progi->regstclass) ) {
2275 const OPCODE op = OP(progi->regstclass);
2276 /* don't bother with what can't match */
2277 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2278 strend = HOPc(strend, -(minlen - 1));
2281 SV * const prop = sv_newmortal();
2282 regprop(prog, prop, c);
2284 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2286 PerlIO_printf(Perl_debug_log,
2287 "Matching stclass %.*s against %s (%d bytes)\n",
2288 (int)SvCUR(prop), SvPVX_const(prop),
2289 quoted, (int)(strend - s));
2292 if (find_byclass(prog, c, s, strend, ®info))
2294 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2298 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2303 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
2304 utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog);
2305 float_real = utf8_target ? prog->float_utf8 : prog->float_substr;
2307 if (flags & REXEC_SCREAM) {
2308 last = screaminstr(sv, float_real, s - strbeg,
2309 end_shift, &scream_pos, 1); /* last one */
2311 last = scream_olds; /* Only one occurrence. */
2312 /* we may be pointing at the wrong string */
2313 else if (RXp_MATCH_COPIED(prog))
2314 s = strbeg + (s - SvPVX_const(sv));
2318 const char * const little = SvPV_const(float_real, len);
2320 if (SvTAIL(float_real)) {
2321 if (memEQ(strend - len + 1, little, len - 1))
2322 last = strend - len + 1;
2323 else if (!multiline)
2324 last = memEQ(strend - len, little, len)
2325 ? strend - len : NULL;
2331 last = rninstr(s, strend, little, little + len);
2333 last = strend; /* matching "$" */
2338 PerlIO_printf(Perl_debug_log,
2339 "%sCan't trim the tail, match fails (should not happen)%s\n",
2340 PL_colors[4], PL_colors[5]));
2341 goto phooey; /* Should not happen! */
2343 dontbother = strend - last + prog->float_min_offset;
2345 if (minlen && (dontbother < minlen))
2346 dontbother = minlen - 1;
2347 strend -= dontbother; /* this one's always in bytes! */
2348 /* We don't know much -- general case. */
2351 if (regtry(®info, &s))
2360 if (regtry(®info, &s))
2362 } while (s++ < strend);
2371 RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
2373 if (PL_reg_eval_set)
2374 restore_pos(aTHX_ prog);
2375 if (RXp_PAREN_NAMES(prog))
2376 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
2378 /* make sure $`, $&, $', and $digit will work later */
2379 if ( !(flags & REXEC_NOT_FIRST) ) {
2380 RX_MATCH_COPY_FREE(rx);
2381 if (flags & REXEC_COPY_STR) {
2382 const I32 i = PL_regeol - startpos + (stringarg - strbeg);
2383 #ifdef PERL_OLD_COPY_ON_WRITE
2385 || (SvFLAGS(sv) & CAN_COW_MASK) == CAN_COW_FLAGS)) {
2387 PerlIO_printf(Perl_debug_log,
2388 "Copy on write: regexp capture, type %d\n",
2391 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2392 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2393 assert (SvPOKp(prog->saved_copy));
2397 RX_MATCH_COPIED_on(rx);
2398 s = savepvn(strbeg, i);
2404 prog->subbeg = strbeg;
2405 prog->sublen = PL_regeol - strbeg; /* strend may have been modified */
2412 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
2413 PL_colors[4], PL_colors[5]));
2414 if (PL_reg_eval_set)
2415 restore_pos(aTHX_ prog);
2417 /* we failed :-( roll it back */
2418 Safefree(prog->offs);
2427 - regtry - try match at specific point
2429 STATIC I32 /* 0 failure, 1 success */
2430 S_regtry(pTHX_ regmatch_info *reginfo, char **startpos)
2434 REGEXP *const rx = reginfo->prog;
2435 regexp *const prog = (struct regexp *)SvANY(rx);
2436 RXi_GET_DECL(prog,progi);
2437 GET_RE_DEBUG_FLAGS_DECL;
2439 PERL_ARGS_ASSERT_REGTRY;
2441 reginfo->cutpoint=NULL;
2443 if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) {
2446 PL_reg_eval_set = RS_init;
2447 DEBUG_EXECUTE_r(DEBUG_s(
2448 PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n",
2449 (IV)(PL_stack_sp - PL_stack_base));
2452 cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base;
2453 /* Otherwise OP_NEXTSTATE will free whatever on stack now. */
2455 /* Apparently this is not needed, judging by wantarray. */
2456 /* SAVEI8(cxstack[cxstack_ix].blk_gimme);
2457 cxstack[cxstack_ix].blk_gimme = G_SCALAR; */
2460 /* Make $_ available to executed code. */
2461 if (reginfo->sv != DEFSV) {
2463 DEFSV_set(reginfo->sv);
2466 if (!(SvTYPE(reginfo->sv) >= SVt_PVMG && SvMAGIC(reginfo->sv)
2467 && (mg = mg_find(reginfo->sv, PERL_MAGIC_regex_global)))) {
2468 /* prepare for quick setting of pos */
2469 #ifdef PERL_OLD_COPY_ON_WRITE
2470 if (SvIsCOW(reginfo->sv))
2471 sv_force_normal_flags(reginfo->sv, 0);
2473 mg = sv_magicext(reginfo->sv, NULL, PERL_MAGIC_regex_global,
2474 &PL_vtbl_mglob, NULL, 0);
2478 PL_reg_oldpos = mg->mg_len;
2479 SAVEDESTRUCTOR_X(restore_pos, prog);
2481 if (!PL_reg_curpm) {
2482 Newxz(PL_reg_curpm, 1, PMOP);
2485 SV* const repointer = &PL_sv_undef;
2486 /* this regexp is also owned by the new PL_reg_curpm, which
2487 will try to free it. */
2488 av_push(PL_regex_padav, repointer);
2489 PL_reg_curpm->op_pmoffset = av_len(PL_regex_padav);
2490 PL_regex_pad = AvARRAY(PL_regex_padav);
2495 /* It seems that non-ithreads works both with and without this code.
2496 So for efficiency reasons it seems best not to have the code
2497 compiled when it is not needed. */
2498 /* This is safe against NULLs: */
2499 ReREFCNT_dec(PM_GETRE(PL_reg_curpm));
2500 /* PM_reg_curpm owns a reference to this regexp. */
2503 PM_SETRE(PL_reg_curpm, rx);
2504 PL_reg_oldcurpm = PL_curpm;
2505 PL_curpm = PL_reg_curpm;
2506 if (RXp_MATCH_COPIED(prog)) {
2507 /* Here is a serious problem: we cannot rewrite subbeg,
2508 since it may be needed if this match fails. Thus
2509 $` inside (?{}) could fail... */
2510 PL_reg_oldsaved = prog->subbeg;
2511 PL_reg_oldsavedlen = prog->sublen;
2512 #ifdef PERL_OLD_COPY_ON_WRITE
2513 PL_nrs = prog->saved_copy;
2515 RXp_MATCH_COPIED_off(prog);
2518 PL_reg_oldsaved = NULL;
2519 prog->subbeg = PL_bostr;
2520 prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */
2522 DEBUG_EXECUTE_r(PL_reg_starttry = *startpos);
2523 prog->offs[0].start = *startpos - PL_bostr;
2524 PL_reginput = *startpos;
2525 PL_reglastparen = &prog->lastparen;
2526 PL_reglastcloseparen = &prog->lastcloseparen;
2527 prog->lastparen = 0;
2528 prog->lastcloseparen = 0;
2530 PL_regoffs = prog->offs;
2531 if (PL_reg_start_tmpl <= prog->nparens) {
2532 PL_reg_start_tmpl = prog->nparens*3/2 + 3;
2533 if(PL_reg_start_tmp)
2534 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2536 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
2539 /* XXXX What this code is doing here?!!! There should be no need
2540 to do this again and again, PL_reglastparen should take care of
2543 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
2544 * Actually, the code in regcppop() (which Ilya may be meaning by
2545 * PL_reglastparen), is not needed at all by the test suite
2546 * (op/regexp, op/pat, op/split), but that code is needed otherwise
2547 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
2548 * Meanwhile, this code *is* needed for the
2549 * above-mentioned test suite tests to succeed. The common theme
2550 * on those tests seems to be returning null fields from matches.
2551 * --jhi updated by dapm */
2553 if (prog->nparens) {
2554 regexp_paren_pair *pp = PL_regoffs;
2556 for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) {
2564 if (regmatch(reginfo, progi->program + 1)) {
2565 PL_regoffs[0].end = PL_reginput - PL_bostr;
2568 if (reginfo->cutpoint)
2569 *startpos= reginfo->cutpoint;
2570 REGCP_UNWIND(lastcp);
2575 #define sayYES goto yes
2576 #define sayNO goto no
2577 #define sayNO_SILENT goto no_silent
2579 /* we dont use STMT_START/END here because it leads to
2580 "unreachable code" warnings, which are bogus, but distracting. */
2581 #define CACHEsayNO \
2582 if (ST.cache_mask) \
2583 PL_reg_poscache[ST.cache_offset] |= ST.cache_mask; \
2586 /* this is used to determine how far from the left messages like
2587 'failed...' are printed. It should be set such that messages
2588 are inline with the regop output that created them.
2590 #define REPORT_CODE_OFF 32
2593 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
2594 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
2596 #define SLAB_FIRST(s) (&(s)->states[0])
2597 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
2599 /* grab a new slab and return the first slot in it */
2601 STATIC regmatch_state *
2604 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2607 regmatch_slab *s = PL_regmatch_slab->next;
2609 Newx(s, 1, regmatch_slab);
2610 s->prev = PL_regmatch_slab;
2612 PL_regmatch_slab->next = s;
2614 PL_regmatch_slab = s;
2615 return SLAB_FIRST(s);
2619 /* push a new state then goto it */
2621 #define PUSH_STATE_GOTO(state, node) \
2623 st->resume_state = state; \
2626 /* push a new state with success backtracking, then goto it */
2628 #define PUSH_YES_STATE_GOTO(state, node) \
2630 st->resume_state = state; \
2631 goto push_yes_state;
2637 regmatch() - main matching routine
2639 This is basically one big switch statement in a loop. We execute an op,
2640 set 'next' to point the next op, and continue. If we come to a point which
2641 we may need to backtrack to on failure such as (A|B|C), we push a
2642 backtrack state onto the backtrack stack. On failure, we pop the top
2643 state, and re-enter the loop at the state indicated. If there are no more
2644 states to pop, we return failure.
2646 Sometimes we also need to backtrack on success; for example /A+/, where
2647 after successfully matching one A, we need to go back and try to
2648 match another one; similarly for lookahead assertions: if the assertion
2649 completes successfully, we backtrack to the state just before the assertion
2650 and then carry on. In these cases, the pushed state is marked as
2651 'backtrack on success too'. This marking is in fact done by a chain of
2652 pointers, each pointing to the previous 'yes' state. On success, we pop to
2653 the nearest yes state, discarding any intermediate failure-only states.
2654 Sometimes a yes state is pushed just to force some cleanup code to be
2655 called at the end of a successful match or submatch; e.g. (??{$re}) uses
2656 it to free the inner regex.
2658 Note that failure backtracking rewinds the cursor position, while
2659 success backtracking leaves it alone.
2661 A pattern is complete when the END op is executed, while a subpattern
2662 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
2663 ops trigger the "pop to last yes state if any, otherwise return true"
2666 A common convention in this function is to use A and B to refer to the two
2667 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
2668 the subpattern to be matched possibly multiple times, while B is the entire
2669 rest of the pattern. Variable and state names reflect this convention.
2671 The states in the main switch are the union of ops and failure/success of
2672 substates associated with with that op. For example, IFMATCH is the op
2673 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
2674 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
2675 successfully matched A and IFMATCH_A_fail is a state saying that we have
2676 just failed to match A. Resume states always come in pairs. The backtrack
2677 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
2678 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
2679 on success or failure.
2681 The struct that holds a backtracking state is actually a big union, with
2682 one variant for each major type of op. The variable st points to the
2683 top-most backtrack struct. To make the code clearer, within each
2684 block of code we #define ST to alias the relevant union.
2686 Here's a concrete example of a (vastly oversimplified) IFMATCH
2692 #define ST st->u.ifmatch
2694 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2695 ST.foo = ...; // some state we wish to save
2697 // push a yes backtrack state with a resume value of
2698 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
2700 PUSH_YES_STATE_GOTO(IFMATCH_A, A);
2703 case IFMATCH_A: // we have successfully executed A; now continue with B
2705 bar = ST.foo; // do something with the preserved value
2708 case IFMATCH_A_fail: // A failed, so the assertion failed
2709 ...; // do some housekeeping, then ...
2710 sayNO; // propagate the failure
2717 For any old-timers reading this who are familiar with the old recursive
2718 approach, the code above is equivalent to:
2720 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
2729 ...; // do some housekeeping, then ...
2730 sayNO; // propagate the failure
2733 The topmost backtrack state, pointed to by st, is usually free. If you
2734 want to claim it, populate any ST.foo fields in it with values you wish to
2735 save, then do one of
2737 PUSH_STATE_GOTO(resume_state, node);
2738 PUSH_YES_STATE_GOTO(resume_state, node);
2740 which sets that backtrack state's resume value to 'resume_state', pushes a
2741 new free entry to the top of the backtrack stack, then goes to 'node'.
2742 On backtracking, the free slot is popped, and the saved state becomes the
2743 new free state. An ST.foo field in this new top state can be temporarily
2744 accessed to retrieve values, but once the main loop is re-entered, it
2745 becomes available for reuse.
2747 Note that the depth of the backtrack stack constantly increases during the
2748 left-to-right execution of the pattern, rather than going up and down with
2749 the pattern nesting. For example the stack is at its maximum at Z at the
2750 end of the pattern, rather than at X in the following:
2752 /(((X)+)+)+....(Y)+....Z/
2754 The only exceptions to this are lookahead/behind assertions and the cut,
2755 (?>A), which pop all the backtrack states associated with A before
2758 Bascktrack state structs are allocated in slabs of about 4K in size.
2759 PL_regmatch_state and st always point to the currently active state,
2760 and PL_regmatch_slab points to the slab currently containing
2761 PL_regmatch_state. The first time regmatch() is called, the first slab is
2762 allocated, and is never freed until interpreter destruction. When the slab
2763 is full, a new one is allocated and chained to the end. At exit from
2764 regmatch(), slabs allocated since entry are freed.
2769 #define DEBUG_STATE_pp(pp) \
2771 DUMP_EXEC_POS(locinput, scan, utf8_target); \
2772 PerlIO_printf(Perl_debug_log, \
2773 " %*s"pp" %s%s%s%s%s\n", \
2775 PL_reg_name[st->resume_state], \
2776 ((st==yes_state||st==mark_state) ? "[" : ""), \
2777 ((st==yes_state) ? "Y" : ""), \
2778 ((st==mark_state) ? "M" : ""), \
2779 ((st==yes_state||st==mark_state) ? "]" : "") \
2784 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
2789 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
2790 const char *start, const char *end, const char *blurb)
2792 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
2794 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
2799 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
2800 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
2802 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
2803 start, end - start, 60);
2805 PerlIO_printf(Perl_debug_log,
2806 "%s%s REx%s %s against %s\n",
2807 PL_colors[4], blurb, PL_colors[5], s0, s1);
2809 if (utf8_target||utf8_pat)
2810 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
2811 utf8_pat ? "pattern" : "",
2812 utf8_pat && utf8_target ? " and " : "",
2813 utf8_target ? "string" : ""
2819 S_dump_exec_pos(pTHX_ const char *locinput,
2820 const regnode *scan,
2821 const char *loc_regeol,
2822 const char *loc_bostr,
2823 const char *loc_reg_starttry,
2824 const bool utf8_target)
2826 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
2827 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
2828 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
2829 /* The part of the string before starttry has one color
2830 (pref0_len chars), between starttry and current
2831 position another one (pref_len - pref0_len chars),
2832 after the current position the third one.
2833 We assume that pref0_len <= pref_len, otherwise we
2834 decrease pref0_len. */
2835 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
2836 ? (5 + taill) - l : locinput - loc_bostr;
2839 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
2841 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
2843 pref0_len = pref_len - (locinput - loc_reg_starttry);
2844 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
2845 l = ( loc_regeol - locinput > (5 + taill) - pref_len
2846 ? (5 + taill) - pref_len : loc_regeol - locinput);
2847 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
2851 if (pref0_len > pref_len)
2852 pref0_len = pref_len;
2854 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
2856 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
2857 (locinput - pref_len),pref0_len, 60, 4, 5);
2859 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
2860 (locinput - pref_len + pref0_len),
2861 pref_len - pref0_len, 60, 2, 3);
2863 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
2864 locinput, loc_regeol - locinput, 10, 0, 1);
2866 const STRLEN tlen=len0+len1+len2;
2867 PerlIO_printf(Perl_debug_log,
2868 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
2869 (IV)(locinput - loc_bostr),
2872 (docolor ? "" : "> <"),
2874 (int)(tlen > 19 ? 0 : 19 - tlen),
2881 /* reg_check_named_buff_matched()
2882 * Checks to see if a named buffer has matched. The data array of
2883 * buffer numbers corresponding to the buffer is expected to reside
2884 * in the regexp->data->data array in the slot stored in the ARG() of
2885 * node involved. Note that this routine doesn't actually care about the
2886 * name, that information is not preserved from compilation to execution.
2887 * Returns the index of the leftmost defined buffer with the given name
2888 * or 0 if non of the buffers matched.
2891 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
2894 RXi_GET_DECL(rex,rexi);
2895 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
2896 I32 *nums=(I32*)SvPVX(sv_dat);
2898 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
2900 for ( n=0; n<SvIVX(sv_dat); n++ ) {
2901 if ((I32)*PL_reglastparen >= nums[n] &&
2902 PL_regoffs[nums[n]].end != -1)
2911 /* free all slabs above current one - called during LEAVE_SCOPE */
2914 S_clear_backtrack_stack(pTHX_ void *p)
2916 regmatch_slab *s = PL_regmatch_slab->next;
2921 PL_regmatch_slab->next = NULL;
2923 regmatch_slab * const osl = s;
2930 #define SETREX(Re1,Re2) \
2931 if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \
2934 STATIC I32 /* 0 failure, 1 success */
2935 S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
2937 #if PERL_VERSION < 9 && !defined(PERL_CORE)
2941 register const bool utf8_target = PL_reg_match_utf8;
2942 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2943 REGEXP *rex_sv = reginfo->prog;
2944 regexp *rex = (struct regexp *)SvANY(rex_sv);
2945 RXi_GET_DECL(rex,rexi);
2947 /* the current state. This is a cached copy of PL_regmatch_state */
2948 register regmatch_state *st;
2949 /* cache heavy used fields of st in registers */
2950 register regnode *scan;
2951 register regnode *next;
2952 register U32 n = 0; /* general value; init to avoid compiler warning */
2953 register I32 ln = 0; /* len or last; init to avoid compiler warning */
2954 register char *locinput = PL_reginput;
2955 register I32 nextchr; /* is always set to UCHARAT(locinput) */
2957 bool result = 0; /* return value of S_regmatch */
2958 int depth = 0; /* depth of backtrack stack */
2959 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
2960 const U32 max_nochange_depth =
2961 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
2962 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
2963 regmatch_state *yes_state = NULL; /* state to pop to on success of
2965 /* mark_state piggy backs on the yes_state logic so that when we unwind
2966 the stack on success we can update the mark_state as we go */
2967 regmatch_state *mark_state = NULL; /* last mark state we have seen */
2968 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
2969 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
2971 bool no_final = 0; /* prevent failure from backtracking? */
2972 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
2973 char *startpoint = PL_reginput;
2974 SV *popmark = NULL; /* are we looking for a mark? */
2975 SV *sv_commit = NULL; /* last mark name seen in failure */
2976 SV *sv_yes_mark = NULL; /* last mark name we have seen
2977 during a successfull match */
2978 U32 lastopen = 0; /* last open we saw */
2979 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
2980 SV* const oreplsv = GvSV(PL_replgv);
2981 /* these three flags are set by various ops to signal information to
2982 * the very next op. They have a useful lifetime of exactly one loop
2983 * iteration, and are not preserved or restored by state pushes/pops
2985 bool sw = 0; /* the condition value in (?(cond)a|b) */
2986 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
2987 int logical = 0; /* the following EVAL is:
2991 or the following IFMATCH/UNLESSM is:
2992 false: plain (?=foo)
2993 true: used as a condition: (?(?=foo))
2996 GET_RE_DEBUG_FLAGS_DECL;
2999 PERL_ARGS_ASSERT_REGMATCH;
3001 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3002 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3004 /* on first ever call to regmatch, allocate first slab */
3005 if (!PL_regmatch_slab) {
3006 Newx(PL_regmatch_slab, 1, regmatch_slab);
3007 PL_regmatch_slab->prev = NULL;
3008 PL_regmatch_slab->next = NULL;
3009 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3012 oldsave = PL_savestack_ix;
3013 SAVEDESTRUCTOR_X(S_clear_backtrack_stack, NULL);
3014 SAVEVPTR(PL_regmatch_slab);
3015 SAVEVPTR(PL_regmatch_state);
3017 /* grab next free state slot */
3018 st = ++PL_regmatch_state;
3019 if (st > SLAB_LAST(PL_regmatch_slab))
3020 st = PL_regmatch_state = S_push_slab(aTHX);
3022 /* Note that nextchr is a byte even in UTF */
3023 nextchr = UCHARAT(locinput);
3025 while (scan != NULL) {
3028 SV * const prop = sv_newmortal();
3029 regnode *rnext=regnext(scan);
3030 DUMP_EXEC_POS( locinput, scan, utf8_target );
3031 regprop(rex, prop, scan);
3033 PerlIO_printf(Perl_debug_log,
3034 "%3"IVdf":%*s%s(%"IVdf")\n",
3035 (IV)(scan - rexi->program), depth*2, "",
3037 (PL_regkind[OP(scan)] == END || !rnext) ?
3038 0 : (IV)(rnext - rexi->program));
3041 next = scan + NEXT_OFF(scan);
3044 state_num = OP(scan);
3046 REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3049 assert(PL_reglastparen == &rex->lastparen);
3050 assert(PL_reglastcloseparen == &rex->lastcloseparen);
3051 assert(PL_regoffs == rex->offs);
3053 switch (state_num) {
3055 if (locinput == PL_bostr)
3057 /* reginfo->till = reginfo->bol; */
3062 if (locinput == PL_bostr ||
3063 ((nextchr || locinput < PL_regeol) && locinput[-1] == '\n'))
3069 if (locinput == PL_bostr)
3073 if (locinput == reginfo->ganch)
3078 /* update the startpoint */
3079 st->u.keeper.val = PL_regoffs[0].start;
3080 PL_reginput = locinput;
3081 PL_regoffs[0].start = locinput - PL_bostr;
3082 PUSH_STATE_GOTO(KEEPS_next, next);
3084 case KEEPS_next_fail:
3085 /* rollback the start point change */
3086 PL_regoffs[0].start = st->u.keeper.val;
3092 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3097 if ((nextchr || locinput < PL_regeol) && nextchr != '\n')
3099 if (PL_regeol - locinput > 1)
3103 if (PL_regeol != locinput)
3107 if (!nextchr && locinput >= PL_regeol)
3110 locinput += PL_utf8skip[nextchr];
3111 if (locinput > PL_regeol)
3113 nextchr = UCHARAT(locinput);
3116 nextchr = UCHARAT(++locinput);
3119 if (!nextchr && locinput >= PL_regeol)
3121 nextchr = UCHARAT(++locinput);
3124 if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n')
3127 locinput += PL_utf8skip[nextchr];
3128 if (locinput > PL_regeol)
3130 nextchr = UCHARAT(locinput);
3133 nextchr = UCHARAT(++locinput);
3137 #define ST st->u.trie
3139 /* In this case the charclass data is available inline so
3140 we can fail fast without a lot of extra overhead.
3142 if (scan->flags == EXACT || !utf8_target) {
3143 if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
3145 PerlIO_printf(Perl_debug_log,
3146 "%*s %sfailed to match trie start class...%s\n",
3147 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3155 /* the basic plan of execution of the trie is:
3156 * At the beginning, run though all the states, and
3157 * find the longest-matching word. Also remember the position
3158 * of the shortest matching word. For example, this pattern:
3161 * when matched against the string "abcde", will generate
3162 * accept states for all words except 3, with the longest
3163 * matching word being 4, and the shortest being 1 (with
3164 * the position being after char 1 of the string).
3166 * Then for each matching word, in word order (i.e. 1,2,4,5),
3167 * we run the remainder of the pattern; on each try setting
3168 * the current position to the character following the word,
3169 * returning to try the next word on failure.
3171 * We avoid having to build a list of words at runtime by
3172 * using a compile-time structure, wordinfo[].prev, which
3173 * gives, for each word, the previous accepting word (if any).
3174 * In the case above it would contain the mappings 1->2, 2->0,
3175 * 3->0, 4->5, 5->1. We can use this table to generate, from
3176 * the longest word (4 above), a list of all words, by
3177 * following the list of prev pointers; this gives us the
3178 * unordered list 4,5,1,2. Then given the current word we have
3179 * just tried, we can go through the list and find the
3180 * next-biggest word to try (so if we just failed on word 2,
3181 * the next in the list is 4).
3183 * Since at runtime we don't record the matching position in
3184 * the string for each word, we have to work that out for
3185 * each word we're about to process. The wordinfo table holds
3186 * the character length of each word; given that we recorded
3187 * at the start: the position of the shortest word and its
3188 * length in chars, we just need to move the pointer the
3189 * difference between the two char lengths. Depending on
3190 * Unicode status and folding, that's cheap or expensive.
3192 * This algorithm is optimised for the case where are only a
3193 * small number of accept states, i.e. 0,1, or maybe 2.
3194 * With lots of accepts states, and having to try all of them,
3195 * it becomes quadratic on number of accept states to find all
3200 /* what type of TRIE am I? (utf8 makes this contextual) */
3201 DECL_TRIE_TYPE(scan);
3203 /* what trie are we using right now */
3204 reg_trie_data * const trie
3205 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
3206 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
3207 U32 state = trie->startstate;
3209 if (trie->bitmap && trie_type != trie_utf8_fold &&
3210 !TRIE_BITMAP_TEST(trie,*locinput)
3212 if (trie->states[ state ].wordnum) {
3214 PerlIO_printf(Perl_debug_log,
3215 "%*s %smatched empty string...%s\n",
3216 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3221 PerlIO_printf(Perl_debug_log,
3222 "%*s %sfailed to match trie start class...%s\n",
3223 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
3230 U8 *uc = ( U8* )locinput;
3234 U8 *uscan = (U8*)NULL;
3235 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
3236 U32 charcount = 0; /* how many input chars we have matched */
3237 U32 accepted = 0; /* have we seen any accepting states? */
3240 ST.jump = trie->jump;
3243 ST.longfold = FALSE; /* char longer if folded => it's harder */
3246 /* fully traverse the TRIE; note the position of the
3247 shortest accept state and the wordnum of the longest
3250 while ( state && uc <= (U8*)PL_regeol ) {
3251 U32 base = trie->states[ state ].trans.base;
3255 wordnum = trie->states[ state ].wordnum;
3257 if (wordnum) { /* it's an accept state */
3260 /* record first match position */
3262 ST.firstpos = (U8*)locinput;
3267 ST.firstchars = charcount;
3270 if (!ST.nextword || wordnum < ST.nextword)
3271 ST.nextword = wordnum;
3272 ST.topword = wordnum;
3275 DEBUG_TRIE_EXECUTE_r({
3276 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
3277 PerlIO_printf( Perl_debug_log,
3278 "%*s %sState: %4"UVxf" Accepted: %c ",
3279 2+depth * 2, "", PL_colors[4],
3280 (UV)state, (accepted ? 'Y' : 'N'));
3283 /* read a char and goto next state */
3286 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
3287 uscan, len, uvc, charid, foldlen,
3294 base + charid - 1 - trie->uniquecharcount)) >= 0)
3296 && ((U32)offset < trie->lasttrans)
3297 && trie->trans[offset].check == state)
3299 state = trie->trans[offset].next;
3310 DEBUG_TRIE_EXECUTE_r(
3311 PerlIO_printf( Perl_debug_log,
3312 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
3313 charid, uvc, (UV)state, PL_colors[5] );
3319 /* calculate total number of accept states */
3324 w = trie->wordinfo[w].prev;
3327 ST.accepted = accepted;
3331 PerlIO_printf( Perl_debug_log,
3332 "%*s %sgot %"IVdf" possible matches%s\n",
3333 REPORT_CODE_OFF + depth * 2, "",
3334 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
3336 goto trie_first_try; /* jump into the fail handler */
3340 case TRIE_next_fail: /* we failed - try next alternative */
3342 REGCP_UNWIND(ST.cp);
3343 for (n = *PL_reglastparen; n > ST.lastparen; n--)
3344 PL_regoffs[n].end = -1;
3345 *PL_reglastparen = n;
3347 if (!--ST.accepted) {
3349 PerlIO_printf( Perl_debug_log,
3350 "%*s %sTRIE failed...%s\n",
3351 REPORT_CODE_OFF+depth*2, "",
3358 /* Find next-highest word to process. Note that this code
3359 * is O(N^2) per trie run (O(N) per branch), so keep tight */
3360 register U16 min = 0;
3362 register U16 const nextword = ST.nextword;
3363 register reg_trie_wordinfo * const wordinfo
3364 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
3365 for (word=ST.topword; word; word=wordinfo[word].prev) {
3366 if (word > nextword && (!min || word < min))
3379 ST.lastparen = *PL_reglastparen;
3383 /* find start char of end of current word */
3385 U32 chars; /* how many chars to skip */
3386 U8 *uc = ST.firstpos;
3387 reg_trie_data * const trie
3388 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
3390 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
3392 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
3396 /* the hard option - fold each char in turn and find
3397 * its folded length (which may be different */
3398 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
3406 uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len,
3414 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
3419 uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len,
3433 PL_reginput = (char *)uc;
3436 scan = (ST.jump && ST.jump[ST.nextword])
3437 ? ST.me + ST.jump[ST.nextword]
3441 PerlIO_printf( Perl_debug_log,
3442 "%*s %sTRIE matched word #%d, continuing%s\n",
3443 REPORT_CODE_OFF+depth*2, "",
3450 if (ST.accepted > 1 || has_cutgroup) {
3451 PUSH_STATE_GOTO(TRIE_next, scan);
3454 /* only one choice left - just continue */
3456 AV *const trie_words
3457 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
3458 SV ** const tmp = av_fetch( trie_words,
3460 SV *sv= tmp ? sv_newmortal() : NULL;
3462 PerlIO_printf( Perl_debug_log,
3463 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
3464 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
3466 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
3467 PL_colors[0], PL_colors[1],
3468 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
3470 : "not compiled under -Dr",
3474 locinput = PL_reginput;
3475 nextchr = UCHARAT(locinput);
3476 continue; /* execute rest of RE */
3481 char *s = STRING(scan);
3483 if (utf8_target != UTF_PATTERN) {
3484 /* The target and the pattern have differing utf8ness. */
3486 const char * const e = s + ln;
3489 /* The target is utf8, the pattern is not utf8. */
3494 if (NATIVE_TO_UNI(*(U8*)s) !=
3495 utf8n_to_uvuni((U8*)l, UTF8_MAXBYTES, &ulen,
3503 /* The target is not utf8, the pattern is utf8. */
3508 if (NATIVE_TO_UNI(*((U8*)l)) !=
3509 utf8n_to_uvuni((U8*)s, UTF8_MAXBYTES, &ulen,
3517 nextchr = UCHARAT(locinput);
3520 /* The target and the pattern have the same utf8ness. */
3521 /* Inline the first character, for speed. */
3522 if (UCHARAT(s) != nextchr)
3524 if (PL_regeol - locinput < ln)
3526 if (ln > 1 && memNE(s, locinput, ln))
3529 nextchr = UCHARAT(locinput);
3533 PL_reg_flags |= RF_tainted;
3536 char * const s = STRING(scan);
3539 if (utf8_target || UTF_PATTERN) {
3540 /* Either target or the pattern are utf8. */
3541 const char * const l = locinput;
3542 char *e = PL_regeol;
3544 if (! foldEQ_utf8(s, 0, ln, cBOOL(UTF_PATTERN),
3545 l, &e, 0, utf8_target)) {
3546 /* One more case for the sharp s:
3547 * pack("U0U*", 0xDF) =~ /ss/i,
3548 * the 0xC3 0x9F are the UTF-8
3549 * byte sequence for the U+00DF. */
3551 if (!(utf8_target &&
3552 toLOWER(s[0]) == 's' &&
3554 toLOWER(s[1]) == 's' &&
3561 nextchr = UCHARAT(locinput);
3565 /* Neither the target and the pattern are utf8. */
3567 /* Inline the first character, for speed. */
3568 if (UCHARAT(s) != nextchr &&
3569 UCHARAT(s) != ((OP(scan) == EXACTF)
3570 ? PL_fold : PL_fold_locale)[nextchr])
3572 if (PL_regeol - locinput < ln)
3574 if (ln > 1 && (OP(scan) == EXACTF
3575 ? ! foldEQ(s, locinput, ln)
3576 : ! foldEQ_locale(s, locinput, ln)))
3579 nextchr = UCHARAT(locinput);
3584 PL_reg_flags |= RF_tainted;
3588 /* was last char in word? */
3590 if (locinput == PL_bostr)
3593 const U8 * const r = reghop3((U8*)locinput, -1, (U8*)PL_bostr);
3595 ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
3597 if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3598 ln = isALNUM_uni(ln);
3599 LOAD_UTF8_CHARCLASS_ALNUM();
3600 n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target);
3603 ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
3604 n = isALNUM_LC_utf8((U8*)locinput);
3608 ln = (locinput != PL_bostr) ?
3609 UCHARAT(locinput - 1) : '\n';
3610 if (FLAGS(scan) & USE_UNI) {
3612 /* Here, can't be BOUNDL or NBOUNDL because they never set
3613 * the flags to USE_UNI */
3614 ln = isWORDCHAR_L1(ln);
3615 n = isWORDCHAR_L1(nextchr);
3617 else if (OP(scan) == BOUND || OP(scan) == NBOUND) {
3619 n = isALNUM(nextchr);
3622 ln = isALNUM_LC(ln);
3623 n = isALNUM_LC(nextchr);
3626 if (((!ln) == (!n)) == (OP(scan) == BOUND ||
3627 OP(scan) == BOUNDL))
3632 STRLEN inclasslen = PL_regeol - locinput;
3634 if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target))
3636 if (locinput >= PL_regeol)
3638 locinput += inclasslen ? inclasslen : UTF8SKIP(locinput);
3639 nextchr = UCHARAT(locinput);
3644 nextchr = UCHARAT(locinput);
3645 if (!REGINCLASS(rex, scan, (U8*)locinput))
3647 if (!nextchr && locinput >= PL_regeol)
3649 nextchr = UCHARAT(++locinput);
3653 /* If we might have the case of the German sharp s
3654 * in a casefolding Unicode character class. */
3656 if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) {
3657 locinput += SHARP_S_SKIP;
3658 nextchr = UCHARAT(locinput);
3663 /* Special char classes - The defines start on line 129 or so */
3664 CCC_TRY_AFF_U( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3665 CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC);
3667 CCC_TRY_AFF_U( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3668 CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC);
3670 CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3671 CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC);
3673 case CLUMP: /* Match \X: logical Unicode character. This is defined as
3674 a Unicode extended Grapheme Cluster */
3675 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
3676 extended Grapheme Cluster is:
3679 | Prepend* Begin Extend*
3682 Begin is (Hangul-syllable | ! Control)
3683 Extend is (Grapheme_Extend | Spacing_Mark)
3684 Control is [ GCB_Control CR LF ]
3686 The discussion below shows how the code for CLUMP is derived
3687 from this regex. Note that most of these concepts are from
3688 property values of the Grapheme Cluster Boundary (GCB) property.
3689 No code point can have multiple property values for a given
3690 property. Thus a code point in Prepend can't be in Control, but
3691 it must be in !Control. This is why Control above includes
3692 GCB_Control plus CR plus LF. The latter two are used in the GCB
3693 property separately, and so can't be in GCB_Control, even though
3694 they logically are controls. Control is not the same as gc=cc,
3695 but includes format and other characters as well.
3697 The Unicode definition of Hangul-syllable is:
3699 | (L* ( ( V | LV ) V* | LVT ) T*)
3702 Each of these is a value for the GCB property, and hence must be
3703 disjoint, so the order they are tested is immaterial, so the
3704 above can safely be changed to
3707 | (L* ( LVT | ( V | LV ) V*) T*)
3709 The last two terms can be combined like this:
3711 | (( LVT | ( V | LV ) V*) T*))
3713 And refactored into this:
3714 L* (L | LVT T* | V V* T* | LV V* T*)
3716 That means that if we have seen any L's at all we can quit
3717 there, but if the next character is a LVT, a V or and LV we
3720 There is a subtlety with Prepend* which showed up in testing.
3721 Note that the Begin, and only the Begin is required in:
3722 | Prepend* Begin Extend*
3723 Also, Begin contains '! Control'. A Prepend must be a '!
3724 Control', which means it must be a Begin. What it comes down to
3725 is that if we match Prepend* and then find no suitable Begin
3726 afterwards, that if we backtrack the last Prepend, that one will
3727 be a suitable Begin.
3730 if (locinput >= PL_regeol)
3732 if (! utf8_target) {
3734 /* Match either CR LF or '.', as all the other possibilities
3736 locinput++; /* Match the . or CR */
3738 && locinput < PL_regeol
3739 && UCHARAT(locinput) == '\n') locinput++;
3743 /* Utf8: See if is ( CR LF ); already know that locinput <
3744 * PL_regeol, so locinput+1 is in bounds */
3745 if (nextchr == '\r' && UCHARAT(locinput + 1) == '\n') {
3749 /* In case have to backtrack to beginning, then match '.' */
3750 char *starting = locinput;
3752 /* In case have to backtrack the last prepend */
3753 char *previous_prepend = 0;
3755 LOAD_UTF8_CHARCLASS_GCB();
3757 /* Match (prepend)* */
3758 while (locinput < PL_regeol
3759 && swash_fetch(PL_utf8_X_prepend,
3760 (U8*)locinput, utf8_target))
3762 previous_prepend = locinput;
3763 locinput += UTF8SKIP(locinput);
3766 /* As noted above, if we matched a prepend character, but
3767 * the next thing won't match, back off the last prepend we
3768 * matched, as it is guaranteed to match the begin */
3769 if (previous_prepend
3770 && (locinput >= PL_regeol
3771 || ! swash_fetch(PL_utf8_X_begin,
3772 (U8*)locinput, utf8_target)))
3774 locinput = previous_prepend;
3777 /* Note that here we know PL_regeol > locinput, as we
3778 * tested that upon input to this switch case, and if we
3779 * moved locinput forward, we tested the result just above
3780 * and it either passed, or we backed off so that it will
3782 if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) {
3784 /* Here did not match the required 'Begin' in the
3785 * second term. So just match the very first
3786 * character, the '.' of the final term of the regex */
3787 locinput = starting + UTF8SKIP(starting);
3790 /* Here is the beginning of a character that can have
3791 * an extender. It is either a hangul syllable, or a
3793 if (swash_fetch(PL_utf8_X_non_hangul,
3794 (U8*)locinput, utf8_target))
3797 /* Here not a Hangul syllable, must be a
3798 * ('! * Control') */
3799 locinput += UTF8SKIP(locinput);
3802 /* Here is a Hangul syllable. It can be composed
3803 * of several individual characters. One
3804 * possibility is T+ */
3805 if (swash_fetch(PL_utf8_X_T,
3806 (U8*)locinput, utf8_target))
3808 while (locinput < PL_regeol
3809 && swash_fetch(PL_utf8_X_T,
3810 (U8*)locinput, utf8_target))
3812 locinput += UTF8SKIP(locinput);
3816 /* Here, not T+, but is a Hangul. That means
3817 * it is one of the others: L, LV, LVT or V,
3819 * L* (L | LVT T* | V V* T* | LV V* T*) */
3822 while (locinput < PL_regeol
3823 && swash_fetch(PL_utf8_X_L,
3824 (U8*)locinput, utf8_target))
3826 locinput += UTF8SKIP(locinput);
3829 /* Here, have exhausted L*. If the next
3830 * character is not an LV, LVT nor V, it means
3831 * we had to have at least one L, so matches L+
3832 * in the original equation, we have a complete
3833 * hangul syllable. Are done. */
3835 if (locinput < PL_regeol
3836 && swash_fetch(PL_utf8_X_LV_LVT_V,
3837 (U8*)locinput, utf8_target))
3840 /* Otherwise keep going. Must be LV, LVT
3841 * or V. See if LVT */
3842 if (swash_fetch(PL_utf8_X_LVT,
3843 (U8*)locinput, utf8_target))
3845 locinput += UTF8SKIP(locinput);
3848 /* Must be V or LV. Take it, then
3850 locinput += UTF8SKIP(locinput);
3851 while (locinput < PL_regeol
3852 && swash_fetch(PL_utf8_X_V,
3853 (U8*)locinput, utf8_target))
3855 locinput += UTF8SKIP(locinput);
3859 /* And any of LV, LVT, or V can be followed
3861 while (locinput < PL_regeol
3862 && swash_fetch(PL_utf8_X_T,
3866 locinput += UTF8SKIP(locinput);
3872 /* Match any extender */
3873 while (locinput < PL_regeol
3874 && swash_fetch(PL_utf8_X_extend,
3875 (U8*)locinput, utf8_target))
3877 locinput += UTF8SKIP(locinput);
3881 if (locinput > PL_regeol) sayNO;
3883 nextchr = UCHARAT(locinput);
3890 PL_reg_flags |= RF_tainted;
3895 n = reg_check_named_buff_matched(rex,scan);
3898 type = REF + ( type - NREF );
3905 PL_reg_flags |= RF_tainted;
3909 n = ARG(scan); /* which paren pair */
3912 ln = PL_regoffs[n].start;
3913 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
3914 if (*PL_reglastparen < n || ln == -1)
3915 sayNO; /* Do not match unless seen CLOSEn. */
3916 if (ln == PL_regoffs[n].end)
3920 if (utf8_target && type != REF) { /* REF can do byte comparison */
3922 const char *e = PL_bostr + PL_regoffs[n].end;
3924 * Note that we can't do the "other character" lookup trick as
3925 * in the 8-bit case (no pun intended) because in Unicode we
3926 * have to map both upper and title case to lower case.
3930 STRLEN ulen1, ulen2;
3931 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
3932 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
3936 toLOWER_utf8((U8*)s, tmpbuf1, &ulen1);
3937 toLOWER_utf8((U8*)l, tmpbuf2, &ulen2);
3938 if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1))
3945 nextchr = UCHARAT(locinput);
3949 /* Inline the first character, for speed. */
3950 if (UCHARAT(s) != nextchr &&
3952 (UCHARAT(s) != (type == REFF
3953 ? PL_fold : PL_fold_locale)[nextchr])))
3955 ln = PL_regoffs[n].end - ln;
3956 if (locinput + ln > PL_regeol)
3958 if (ln > 1 && (type == REF
3959 ? memNE(s, locinput, ln)
3961 ? ! foldEQ(s, locinput, ln)
3962 : ! foldEQ_locale(s, locinput, ln))))
3965 nextchr = UCHARAT(locinput);
3975 #define ST st->u.eval
3980 regexp_internal *rei;
3981 regnode *startpoint;
3984 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
3985 if (cur_eval && cur_eval->locinput==locinput) {
3986 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
3987 Perl_croak(aTHX_ "Infinite recursion in regex");
3988 if ( ++nochange_depth > max_nochange_depth )
3990 "Pattern subroutine nesting without pos change"
3991 " exceeded limit in regex");
3998 (void)ReREFCNT_inc(rex_sv);
3999 if (OP(scan)==GOSUB) {
4000 startpoint = scan + ARG2L(scan);
4001 ST.close_paren = ARG(scan);
4003 startpoint = rei->program+1;
4006 goto eval_recurse_doit;
4008 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
4009 if (cur_eval && cur_eval->locinput==locinput) {
4010 if ( ++nochange_depth > max_nochange_depth )
4011 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
4016 /* execute the code in the {...} */
4018 SV ** const before = SP;
4019 OP_4tree * const oop = PL_op;
4020 COP * const ocurcop = PL_curcop;
4022 char *saved_regeol = PL_regeol;
4023 struct re_save_state saved_state;
4025 /* To not corrupt the existing regex state while executing the
4026 * eval we would normally put it on the save stack, like with
4027 * save_re_context. However, re-evals have a weird scoping so we
4028 * can't just add ENTER/LEAVE here. With that, things like
4030 * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a})
4032 * would break, as they expect the localisation to be unwound
4033 * only when the re-engine backtracks through the bit that
4036 * What we do instead is just saving the state in a local c
4039 Copy(&PL_reg_state, &saved_state, 1, struct re_save_state);
4042 PL_op = (OP_4tree*)rexi->data->data[n];
4043 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
4044 " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) );
4045 PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]);
4046 PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr;
4049 SV *sv_mrk = get_sv("REGMARK", 1);
4050 sv_setsv(sv_mrk, sv_yes_mark);
4053 CALLRUNOPS(aTHX); /* Scalar context. */
4056 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
4062 Copy(&saved_state, &PL_reg_state, 1, struct re_save_state);
4065 PAD_RESTORE_LOCAL(old_comppad);
4066 PL_curcop = ocurcop;
4067 PL_regeol = saved_regeol;
4070 sv_setsv(save_scalar(PL_replgv), ret);
4074 if (logical == 2) { /* Postponed subexpression: /(??{...})/ */
4077 /* extract RE object from returned value; compiling if
4083 SV *const sv = SvRV(ret);
4085 if (SvTYPE(sv) == SVt_REGEXP) {
4087 } else if (SvSMAGICAL(sv)) {
4088 mg = mg_find(sv, PERL_MAGIC_qr);
4091 } else if (SvTYPE(ret) == SVt_REGEXP) {
4093 } else if (SvSMAGICAL(ret)) {
4094 if (SvGMAGICAL(ret)) {
4095 /* I don't believe that there is ever qr magic
4097 assert(!mg_find(ret, PERL_MAGIC_qr));
4098 sv_unmagic(ret, PERL_MAGIC_qr);
4101 mg = mg_find(ret, PERL_MAGIC_qr);
4102 /* testing suggests mg only ends up non-NULL for
4103 scalars who were upgraded and compiled in the
4104 else block below. In turn, this is only
4105 triggered in the "postponed utf8 string" tests
4111 rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/
4115 rx = reg_temp_copy(NULL, rx);
4119 const I32 osize = PL_regsize;
4122 assert (SvUTF8(ret));
4123 } else if (SvUTF8(ret)) {
4124 /* Not doing UTF-8, despite what the SV says. Is
4125 this only if we're trapped in use 'bytes'? */
4126 /* Make a copy of the octet sequence, but without
4127 the flag on, as the compiler now honours the
4128 SvUTF8 flag on ret. */
4130 const char *const p = SvPV(ret, len);
4131 ret = newSVpvn_flags(p, len, SVs_TEMP);
4133 rx = CALLREGCOMP(ret, pm_flags);
4135 & (SVs_TEMP | SVs_PADTMP | SVf_READONLY
4137 /* This isn't a first class regexp. Instead, it's
4138 caching a regexp onto an existing, Perl visible
4140 sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0);
4145 re = (struct regexp *)SvANY(rx);
4147 RXp_MATCH_COPIED_off(re);
4148 re->subbeg = rex->subbeg;
4149 re->sublen = rex->sublen;
4152 debug_start_match(re_sv, utf8_target, locinput, PL_regeol,
4153 "Matching embedded");
4155 startpoint = rei->program + 1;
4156 ST.close_paren = 0; /* only used for GOSUB */
4157 /* borrowed from regtry */
4158 if (PL_reg_start_tmpl <= re->nparens) {
4159 PL_reg_start_tmpl = re->nparens*3/2 + 3;
4160 if(PL_reg_start_tmp)
4161 Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4163 Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*);
4166 eval_recurse_doit: /* Share code with GOSUB below this line */
4167 /* run the pattern returned from (??{...}) */
4168 ST.cp = regcppush(0); /* Save *all* the positions. */
4169 REGCP_SET(ST.lastcp);
4171 PL_regoffs = re->offs; /* essentially NOOP on GOSUB */
4173 /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */
4174 PL_reglastparen = &re->lastparen;
4175 PL_reglastcloseparen = &re->lastcloseparen;
4177 re->lastcloseparen = 0;
4179 PL_reginput = locinput;
4182 /* XXXX This is too dramatic a measure... */
4185 ST.toggle_reg_flags = PL_reg_flags;
4187 PL_reg_flags |= RF_utf8;
4189 PL_reg_flags &= ~RF_utf8;
4190 ST.toggle_reg_flags ^= PL_reg_flags; /* diff of old and new */
4192 ST.prev_rex = rex_sv;
4193 ST.prev_curlyx = cur_curlyx;
4194 SETREX(rex_sv,re_sv);
4199 ST.prev_eval = cur_eval;
4201 /* now continue from first node in postoned RE */
4202 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint);
4205 /* logical is 1, /(?(?{...})X|Y)/ */
4206 sw = cBOOL(SvTRUE(ret));
4211 case EVAL_AB: /* cleanup after a successful (??{A})B */
4212 /* note: this is called twice; first after popping B, then A */
4213 PL_reg_flags ^= ST.toggle_reg_flags;
4214 ReREFCNT_dec(rex_sv);
4215 SETREX(rex_sv,ST.prev_rex);
4216 rex = (struct regexp *)SvANY(rex_sv);
4217 rexi = RXi_GET(rex);
4219 cur_eval = ST.prev_eval;
4220 cur_curlyx = ST.prev_curlyx;
4222 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4223 PL_reglastparen = &rex->lastparen;
4224 PL_reglastcloseparen = &rex->lastcloseparen;
4225 /* also update PL_regoffs */
4226 PL_regoffs = rex->offs;
4228 /* XXXX This is too dramatic a measure... */
4230 if ( nochange_depth )
4235 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
4236 /* note: this is called twice; first after popping B, then A */
4237 PL_reg_flags ^= ST.toggle_reg_flags;
4238 ReREFCNT_dec(rex_sv);
4239 SETREX(rex_sv,ST.prev_rex);
4240 rex = (struct regexp *)SvANY(rex_sv);
4241 rexi = RXi_GET(rex);
4242 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
4243 PL_reglastparen = &rex->lastparen;
4244 PL_reglastcloseparen = &rex->lastcloseparen;
4246 PL_reginput = locinput;
4247 REGCP_UNWIND(ST.lastcp);
4249 cur_eval = ST.prev_eval;
4250 cur_curlyx = ST.prev_curlyx;
4251 /* XXXX This is too dramatic a measure... */
4253 if ( nochange_depth )
4259 n = ARG(scan); /* which paren pair */
4260 PL_reg_start_tmp[n] = locinput;
4266 n = ARG(scan); /* which paren pair */
4267 PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr;
4268 PL_regoffs[n].end = locinput - PL_bostr;
4269 /*if (n > PL_regsize)
4271 if (n > *PL_reglastparen)
4272 *PL_reglastparen = n;
4273 *PL_reglastcloseparen = n;
4274 if (cur_eval && cur_eval->u.eval.close_paren == n) {
4282 cursor && OP(cursor)!=END;
4283 cursor=regnext(cursor))
4285 if ( OP(cursor)==CLOSE ){
4287 if ( n <= lastopen ) {
4289 = PL_reg_start_tmp[n] - PL_bostr;
4290 PL_regoffs[n].end = locinput - PL_bostr;
4291 /*if (n > PL_regsize)
4293 if (n > *PL_reglastparen)
4294 *PL_reglastparen = n;
4295 *PL_reglastcloseparen = n;
4296 if ( n == ARG(scan) || (cur_eval &&
4297 cur_eval->u.eval.close_paren == n))
4306 n = ARG(scan); /* which paren pair */
4307 sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1);
4310 /* reg_check_named_buff_matched returns 0 for no match */
4311 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
4315 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
4321 PL_reg_leftiter = PL_reg_maxiter; /* Void cache */
4323 next = NEXTOPER(NEXTOPER(scan));
4325 next = scan + ARG(scan);
4326 if (OP(next) == IFTHEN) /* Fake one. */
4327 next = NEXTOPER(NEXTOPER(next));
4331 logical = scan->flags;
4334 /*******************************************************************
4336 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
4337 pattern, where A and B are subpatterns. (For simple A, CURLYM or
4338 STAR/PLUS/CURLY/CURLYN are used instead.)
4340 A*B is compiled as <CURLYX><A><WHILEM><B>
4342 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
4343 state, which contains the current count, initialised to -1. It also sets
4344 cur_curlyx to point to this state, with any previous value saved in the
4347 CURLYX then jumps straight to the WHILEM op, rather than executing A,
4348 since the pattern may possibly match zero times (i.e. it's a while {} loop
4349 rather than a do {} while loop).
4351 Each entry to WHILEM represents a successful match of A. The count in the
4352 CURLYX block is incremented, another WHILEM state is pushed, and execution
4353 passes to A or B depending on greediness and the current count.
4355 For example, if matching against the string a1a2a3b (where the aN are
4356 substrings that match /A/), then the match progresses as follows: (the
4357 pushed states are interspersed with the bits of strings matched so far):
4360 <CURLYX cnt=0><WHILEM>
4361 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
4362 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
4363 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
4364 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
4366 (Contrast this with something like CURLYM, which maintains only a single
4370 a1 <CURLYM cnt=1> a2
4371 a1 a2 <CURLYM cnt=2> a3
4372 a1 a2 a3 <CURLYM cnt=3> b
4375 Each WHILEM state block marks a point to backtrack to upon partial failure
4376 of A or B, and also contains some minor state data related to that
4377 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
4378 overall state, such as the count, and pointers to the A and B ops.
4380 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
4381 must always point to the *current* CURLYX block, the rules are:
4383 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
4384 and set cur_curlyx to point the new block.
4386 When popping the CURLYX block after a successful or unsuccessful match,
4387 restore the previous cur_curlyx.
4389 When WHILEM is about to execute B, save the current cur_curlyx, and set it
4390 to the outer one saved in the CURLYX block.
4392 When popping the WHILEM block after a successful or unsuccessful B match,
4393 restore the previous cur_curlyx.
4395 Here's an example for the pattern (AI* BI)*BO
4396 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
4399 curlyx backtrack stack
4400 ------ ---------------
4402 CO <CO prev=NULL> <WO>
4403 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4404 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4405 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
4407 At this point the pattern succeeds, and we work back down the stack to
4408 clean up, restoring as we go:
4410 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
4411 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
4412 CO <CO prev=NULL> <WO>
4415 *******************************************************************/
4417 #define ST st->u.curlyx
4419 case CURLYX: /* start of /A*B/ (for complex A) */
4421 /* No need to save/restore up to this paren */
4422 I32 parenfloor = scan->flags;
4424 assert(next); /* keep Coverity happy */
4425 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
4428 /* XXXX Probably it is better to teach regpush to support
4429 parenfloor > PL_regsize... */
4430 if (parenfloor > (I32)*PL_reglastparen)
4431 parenfloor = *PL_reglastparen; /* Pessimization... */
4433 ST.prev_curlyx= cur_curlyx;
4435 ST.cp = PL_savestack_ix;
4437 /* these fields contain the state of the current curly.
4438 * they are accessed by subsequent WHILEMs */
4439 ST.parenfloor = parenfloor;
4444 ST.count = -1; /* this will be updated by WHILEM */
4445 ST.lastloc = NULL; /* this will be updated by WHILEM */
4447 PL_reginput = locinput;
4448 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next));
4452 case CURLYX_end: /* just finished matching all of A*B */
4453 cur_curlyx = ST.prev_curlyx;
4457 case CURLYX_end_fail: /* just failed to match all of A*B */
4459 cur_curlyx = ST.prev_curlyx;
4465 #define ST st->u.whilem
4467 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
4469 /* see the discussion above about CURLYX/WHILEM */
4471 int min = ARG1(cur_curlyx->u.curlyx.me);
4472 int max = ARG2(cur_curlyx->u.curlyx.me);
4473 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
4475 assert(cur_curlyx); /* keep Coverity happy */
4476 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
4477 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
4478 ST.cache_offset = 0;
4481 PL_reginput = locinput;
4483 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4484 "%*s whilem: matched %ld out of %d..%d\n",
4485 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
4488 /* First just match a string of min A's. */
4491 cur_curlyx->u.curlyx.lastloc = locinput;
4492 PUSH_STATE_GOTO(WHILEM_A_pre, A);
4496 /* If degenerate A matches "", assume A done. */
4498 if (locinput == cur_curlyx->u.curlyx.lastloc) {
4499 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4500 "%*s whilem: empty match detected, trying continuation...\n",
4501 REPORT_CODE_OFF+depth*2, "")
4503 goto do_whilem_B_max;
4506 /* super-linear cache processing */
4510 if (!PL_reg_maxiter) {
4511 /* start the countdown: Postpone detection until we
4512 * know the match is not *that* much linear. */
4513 PL_reg_maxiter = (PL_regeol - PL_bostr + 1) * (scan->flags>>4);
4514 /* possible overflow for long strings and many CURLYX's */
4515 if (PL_reg_maxiter < 0)
4516 PL_reg_maxiter = I32_MAX;
4517 PL_reg_leftiter = PL_reg_maxiter;
4520 if (PL_reg_leftiter-- == 0) {
4521 /* initialise cache */
4522 const I32 size = (PL_reg_maxiter + 7)/8;
4523 if (PL_reg_poscache) {
4524 if ((I32)PL_reg_poscache_size < size) {
4525 Renew(PL_reg_poscache, size, char);
4526 PL_reg_poscache_size = size;
4528 Zero(PL_reg_poscache, size, char);
4531 PL_reg_poscache_size = size;
4532 Newxz(PL_reg_poscache, size, char);
4534 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4535 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
4536 PL_colors[4], PL_colors[5])
4540 if (PL_reg_leftiter < 0) {
4541 /* have we already failed at this position? */
4543 offset = (scan->flags & 0xf) - 1
4544 + (locinput - PL_bostr) * (scan->flags>>4);
4545 mask = 1 << (offset % 8);
4547 if (PL_reg_poscache[offset] & mask) {
4548 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
4549 "%*s whilem: (cache) already tried at this position...\n",
4550 REPORT_CODE_OFF+depth*2, "")
4552 sayNO; /* cache records failure */
4554 ST.cache_offset = offset;
4555 ST.cache_mask = mask;
4559 /* Prefer B over A for minimal matching. */
4561 if (cur_curlyx->u.curlyx.minmod) {
4562 ST.save_curlyx = cur_curlyx;
4563 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4564 ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor);
4565 REGCP_SET(ST.lastcp);
4566 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B);
4570 /* Prefer A over B for maximal matching. */
4572 if (n < max) { /* More greed allowed? */
4573 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4574 cur_curlyx->u.curlyx.lastloc = locinput;
4575 REGCP_SET(ST.lastcp);
4576 PUSH_STATE_GOTO(WHILEM_A_max, A);
4579 goto do_whilem_B_max;
4583 case WHILEM_B_min: /* just matched B in a minimal match */
4584 case WHILEM_B_max: /* just matched B in a maximal match */
4585 cur_curlyx = ST.save_curlyx;
4589 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
4590 cur_curlyx = ST.save_curlyx;
4591 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4592 cur_curlyx->u.curlyx.count--;
4596 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
4597 REGCP_UNWIND(ST.lastcp);
4600 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
4601 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
4602 cur_curlyx->u.curlyx.count--;
4606 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
4607 REGCP_UNWIND(ST.lastcp);
4608 regcppop(rex); /* Restore some previous $<digit>s? */
4609 PL_reginput = locinput;
4610 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4611 "%*s whilem: failed, trying continuation...\n",
4612 REPORT_CODE_OFF+depth*2, "")
4615 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4616 && ckWARN(WARN_REGEXP)
4617 && !(PL_reg_flags & RF_warned))
4619 PL_reg_flags |= RF_warned;
4620 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded",
4621 "Complex regular subexpression recursion",
4626 ST.save_curlyx = cur_curlyx;
4627 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
4628 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B);
4631 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
4632 cur_curlyx = ST.save_curlyx;
4633 REGCP_UNWIND(ST.lastcp);
4636 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
4637 /* Maximum greed exceeded */
4638 if (cur_curlyx->u.curlyx.count >= REG_INFTY
4639 && ckWARN(WARN_REGEXP)
4640 && !(PL_reg_flags & RF_warned))
4642 PL_reg_flags |= RF_warned;
4643 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
4644 "%s limit (%d) exceeded",
4645 "Complex regular subexpression recursion",
4648 cur_curlyx->u.curlyx.count--;
4652 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
4653 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
4655 /* Try grabbing another A and see if it helps. */
4656 PL_reginput = locinput;
4657 cur_curlyx->u.curlyx.lastloc = locinput;
4658 ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor);
4659 REGCP_SET(ST.lastcp);
4660 PUSH_STATE_GOTO(WHILEM_A_min,
4661 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS);
4665 #define ST st->u.branch
4667 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
4668 next = scan + ARG(scan);
4671 scan = NEXTOPER(scan);
4674 case BRANCH: /* /(...|A|...)/ */
4675 scan = NEXTOPER(scan); /* scan now points to inner node */
4676 ST.lastparen = *PL_reglastparen;
4677 ST.next_branch = next;
4679 PL_reginput = locinput;
4681 /* Now go into the branch */
4683 PUSH_YES_STATE_GOTO(BRANCH_next, scan);
4685 PUSH_STATE_GOTO(BRANCH_next, scan);
4689 PL_reginput = locinput;
4690 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
4691 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
4692 PUSH_STATE_GOTO(CUTGROUP_next,next);
4694 case CUTGROUP_next_fail:
4697 if (st->u.mark.mark_name)
4698 sv_commit = st->u.mark.mark_name;
4704 case BRANCH_next_fail: /* that branch failed; try the next, if any */
4709 REGCP_UNWIND(ST.cp);
4710 for (n = *PL_reglastparen; n > ST.lastparen; n--)
4711 PL_regoffs[n].end = -1;
4712 *PL_reglastparen = n;
4713 /*dmq: *PL_reglastcloseparen = n; */
4714 scan = ST.next_branch;
4715 /* no more branches? */
4716 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
4718 PerlIO_printf( Perl_debug_log,
4719 "%*s %sBRANCH failed...%s\n",
4720 REPORT_CODE_OFF+depth*2, "",
4726 continue; /* execute next BRANCH[J] op */
4734 #define ST st->u.curlym
4736 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
4738 /* This is an optimisation of CURLYX that enables us to push
4739 * only a single backtracking state, no matter how many matches
4740 * there are in {m,n}. It relies on the pattern being constant
4741 * length, with no parens to influence future backrefs
4745 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4747 /* if paren positive, emulate an OPEN/CLOSE around A */
4749 U32 paren = ST.me->flags;
4750 if (paren > PL_regsize)
4752 if (paren > *PL_reglastparen)
4753 *PL_reglastparen = paren;
4754 scan += NEXT_OFF(scan); /* Skip former OPEN. */
4762 ST.c1 = CHRTEST_UNINIT;
4765 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
4768 curlym_do_A: /* execute the A in /A{m,n}B/ */
4769 PL_reginput = locinput;
4770 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */
4773 case CURLYM_A: /* we've just matched an A */
4774 locinput = st->locinput;
4775 nextchr = UCHARAT(locinput);
4778 /* after first match, determine A's length: u.curlym.alen */
4779 if (ST.count == 1) {
4780 if (PL_reg_match_utf8) {
4782 while (s < PL_reginput) {
4788 ST.alen = PL_reginput - locinput;
4791 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
4794 PerlIO_printf(Perl_debug_log,
4795 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
4796 (int)(REPORT_CODE_OFF+(depth*2)), "",
4797 (IV) ST.count, (IV)ST.alen)
4800 locinput = PL_reginput;
4802 if (cur_eval && cur_eval->u.eval.close_paren &&
4803 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4807 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
4808 if ( max == REG_INFTY || ST.count < max )
4809 goto curlym_do_A; /* try to match another A */
4811 goto curlym_do_B; /* try to match B */
4813 case CURLYM_A_fail: /* just failed to match an A */
4814 REGCP_UNWIND(ST.cp);
4816 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
4817 || (cur_eval && cur_eval->u.eval.close_paren &&
4818 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
4821 curlym_do_B: /* execute the B in /A{m,n}B/ */
4822 PL_reginput = locinput;
4823 if (ST.c1 == CHRTEST_UNINIT) {
4824 /* calculate c1 and c2 for possible match of 1st char
4825 * following curly */
4826 ST.c1 = ST.c2 = CHRTEST_VOID;
4827 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
4828 regnode *text_node = ST.B;
4829 if (! HAS_TEXT(text_node))
4830 FIND_NEXT_IMPT(text_node);
4833 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
4835 But the former is redundant in light of the latter.
4837 if this changes back then the macro for
4838 IS_TEXT and friends need to change.
4840 if (PL_regkind[OP(text_node)] == EXACT)
4843 ST.c1 = (U8)*STRING(text_node);
4845 (IS_TEXTF(text_node))
4847 : (IS_TEXTFL(text_node))
4848 ? PL_fold_locale[ST.c1]
4855 PerlIO_printf(Perl_debug_log,
4856 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
4857 (int)(REPORT_CODE_OFF+(depth*2)),
4860 if (ST.c1 != CHRTEST_VOID
4861 && UCHARAT(PL_reginput) != ST.c1
4862 && UCHARAT(PL_reginput) != ST.c2)
4864 /* simulate B failing */
4866 PerlIO_printf(Perl_debug_log,
4867 "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n",
4868 (int)(REPORT_CODE_OFF+(depth*2)),"",
4871 state_num = CURLYM_B_fail;
4872 goto reenter_switch;
4876 /* mark current A as captured */
4877 I32 paren = ST.me->flags;
4879 PL_regoffs[paren].start
4880 = HOPc(PL_reginput, -ST.alen) - PL_bostr;
4881 PL_regoffs[paren].end = PL_reginput - PL_bostr;
4882 /*dmq: *PL_reglastcloseparen = paren; */
4885 PL_regoffs[paren].end = -1;
4886 if (cur_eval && cur_eval->u.eval.close_paren &&
4887 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
4896 PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */
4899 case CURLYM_B_fail: /* just failed to match a B */
4900 REGCP_UNWIND(ST.cp);
4902 I32 max = ARG2(ST.me);
4903 if (max != REG_INFTY && ST.count == max)
4905 goto curlym_do_A; /* try to match a further A */
4907 /* backtrack one A */
4908 if (ST.count == ARG1(ST.me) /* min */)
4911 locinput = HOPc(locinput, -ST.alen);
4912 goto curlym_do_B; /* try to match B */
4915 #define ST st->u.curly
4917 #define CURLY_SETPAREN(paren, success) \
4920 PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \
4921 PL_regoffs[paren].end = locinput - PL_bostr; \
4922 *PL_reglastcloseparen = paren; \
4925 PL_regoffs[paren].end = -1; \
4928 case STAR: /* /A*B/ where A is width 1 */
4932 scan = NEXTOPER(scan);
4934 case PLUS: /* /A+B/ where A is width 1 */
4938 scan = NEXTOPER(scan);
4940 case CURLYN: /* /(A){m,n}B/ where A is width 1 */
4941 ST.paren = scan->flags; /* Which paren to set */
4942 if (ST.paren > PL_regsize)
4943 PL_regsize = ST.paren;
4944 if (ST.paren > *PL_reglastparen)
4945 *PL_reglastparen = ST.paren;
4946 ST.min = ARG1(scan); /* min to match */
4947 ST.max = ARG2(scan); /* max to match */
4948 if (cur_eval && cur_eval->u.eval.close_paren &&
4949 cur_eval->u.eval.close_paren == (U32)ST.paren) {
4953 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
4955 case CURLY: /* /A{m,n}B/ where A is width 1 */
4957 ST.min = ARG1(scan); /* min to match */
4958 ST.max = ARG2(scan); /* max to match */
4959 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
4962 * Lookahead to avoid useless match attempts
4963 * when we know what character comes next.
4965 * Used to only do .*x and .*?x, but now it allows
4966 * for )'s, ('s and (?{ ... })'s to be in the way
4967 * of the quantifier and the EXACT-like node. -- japhy
4970 if (ST.min > ST.max) /* XXX make this a compile-time check? */
4972 if (HAS_TEXT(next) || JUMPABLE(next)) {
4974 regnode *text_node = next;
4976 if (! HAS_TEXT(text_node))
4977 FIND_NEXT_IMPT(text_node);
4979 if (! HAS_TEXT(text_node))
4980 ST.c1 = ST.c2 = CHRTEST_VOID;
4982 if ( PL_regkind[OP(text_node)] != EXACT ) {
4983 ST.c1 = ST.c2 = CHRTEST_VOID;
4984 goto assume_ok_easy;
4987 s = (U8*)STRING(text_node);
4989 /* Currently we only get here when
4991 PL_rekind[OP(text_node)] == EXACT
4993 if this changes back then the macro for IS_TEXT and
4994 friends need to change. */
4997 if (IS_TEXTF(text_node))
4998 ST.c2 = PL_fold[ST.c1];
4999 else if (IS_TEXTFL(text_node))
5000 ST.c2 = PL_fold_locale[ST.c1];
5002 else { /* UTF_PATTERN */
5003 if (IS_TEXTF(text_node)) {
5004 STRLEN ulen1, ulen2;
5005 U8 tmpbuf1[UTF8_MAXBYTES_CASE+1];
5006 U8 tmpbuf2[UTF8_MAXBYTES_CASE+1];
5008 to_utf8_lower((U8*)s, tmpbuf1, &ulen1);
5009 to_utf8_upper((U8*)s, tmpbuf2, &ulen2);
5011 ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0,
5013 0 : UTF8_ALLOW_ANY);
5014 ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0,
5016 0 : UTF8_ALLOW_ANY);
5018 ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0,
5020 ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0,
5025 ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0,
5032 ST.c1 = ST.c2 = CHRTEST_VOID;
5037 PL_reginput = locinput;
5040 if (ST.min && regrepeat(rex, ST.A, ST.min, depth) < ST.min)
5043 locinput = PL_reginput;
5045 if (ST.c1 == CHRTEST_VOID)
5046 goto curly_try_B_min;
5048 ST.oldloc = locinput;
5050 /* set ST.maxpos to the furthest point along the
5051 * string that could possibly match */
5052 if (ST.max == REG_INFTY) {
5053 ST.maxpos = PL_regeol - 1;
5055 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
5058 else if (utf8_target) {
5059 int m = ST.max - ST.min;
5060 for (ST.maxpos = locinput;
5061 m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--)
5062 ST.maxpos += UTF8SKIP(ST.maxpos);
5065 ST.maxpos = locinput + ST.max - ST.min;
5066 if (ST.maxpos >= PL_regeol)
5067 ST.maxpos = PL_regeol - 1;
5069 goto curly_try_B_min_known;
5073 ST.count = regrepeat(rex, ST.A, ST.max, depth);
5074 locinput = PL_reginput;
5075 if (ST.count < ST.min)
5077 if ((ST.count > ST.min)
5078 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
5080 /* A{m,n} must come at the end of the string, there's
5081 * no point in backing off ... */
5083 /* ...except that $ and \Z can match before *and* after
5084 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
5085 We may back off by one in this case. */
5086 if (UCHARAT(PL_reginput - 1) == '\n' && OP(ST.B) != EOS)
5090 goto curly_try_B_max;
5095 case CURLY_B_min_known_fail:
5096 /* failed to find B in a non-greedy match where c1,c2 valid */
5097 if (ST.paren && ST.count)
5098 PL_regoffs[ST.paren].end = -1;
5100 PL_reginput = locinput; /* Could be reset... */
5101 REGCP_UNWIND(ST.cp);
5102 /* Couldn't or didn't -- move forward. */
5103 ST.oldloc = locinput;
5105 locinput += UTF8SKIP(locinput);
5109 curly_try_B_min_known:
5110 /* find the next place where 'B' could work, then call B */
5114 n = (ST.oldloc == locinput) ? 0 : 1;
5115 if (ST.c1 == ST.c2) {
5117 /* set n to utf8_distance(oldloc, locinput) */
5118 while (locinput <= ST.maxpos &&
5119 utf8n_to_uvchr((U8*)locinput,
5120 UTF8_MAXBYTES, &len,
5121 uniflags) != (UV)ST.c1) {
5127 /* set n to utf8_distance(oldloc, locinput) */
5128 while (locinput <= ST.maxpos) {
5130 const UV c = utf8n_to_uvchr((U8*)locinput,
5131 UTF8_MAXBYTES, &len,
5133 if (c == (UV)ST.c1 || c == (UV)ST.c2)
5141 if (ST.c1 == ST.c2) {
5142 while (locinput <= ST.maxpos &&
5143 UCHARAT(locinput) != ST.c1)
5147 while (locinput <= ST.maxpos
5148 && UCHARAT(locinput) != ST.c1
5149 && UCHARAT(locinput) != ST.c2)
5152 n = locinput - ST.oldloc;
5154 if (locinput > ST.maxpos)
5156 /* PL_reginput == oldloc now */
5159 if (regrepeat(rex, ST.A, n, depth) < n)
5162 PL_reginput = locinput;
5163 CURLY_SETPAREN(ST.paren, ST.count);
5164 if (cur_eval && cur_eval->u.eval.close_paren &&
5165 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5168 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B);
5173 case CURLY_B_min_fail:
5174 /* failed to find B in a non-greedy match where c1,c2 invalid */
5175 if (ST.paren && ST.count)
5176 PL_regoffs[ST.paren].end = -1;
5178 REGCP_UNWIND(ST.cp);
5179 /* failed -- move forward one */
5180 PL_reginput = locinput;
5181 if (regrepeat(rex, ST.A, 1, depth)) {
5183 locinput = PL_reginput;
5184 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
5185 ST.count > 0)) /* count overflow ? */
5188 CURLY_SETPAREN(ST.paren, ST.count);
5189 if (cur_eval && cur_eval->u.eval.close_paren &&
5190 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5193 PUSH_STATE_GOTO(CURLY_B_min, ST.B);
5201 /* a successful greedy match: now try to match B */
5202 if (cur_eval && cur_eval->u.eval.close_paren &&
5203 cur_eval->u.eval.close_paren == (U32)ST.paren) {
5208 if (ST.c1 != CHRTEST_VOID)
5209 c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput,
5210 UTF8_MAXBYTES, 0, uniflags)
5211 : (UV) UCHARAT(PL_reginput);
5212 /* If it could work, try it. */
5213 if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) {
5214 CURLY_SETPAREN(ST.paren, ST.count);
5215 PUSH_STATE_GOTO(CURLY_B_max, ST.B);
5220 case CURLY_B_max_fail:
5221 /* failed to find B in a greedy match */
5222 if (ST.paren && ST.count)
5223 PL_regoffs[ST.paren].end = -1;
5225 REGCP_UNWIND(ST.cp);
5227 if (--ST.count < ST.min)
5229 PL_reginput = locinput = HOPc(locinput, -1);
5230 goto curly_try_B_max;
5237 /* we've just finished A in /(??{A})B/; now continue with B */
5239 st->u.eval.toggle_reg_flags
5240 = cur_eval->u.eval.toggle_reg_flags;
5241 PL_reg_flags ^= st->u.eval.toggle_reg_flags;
5243 st->u.eval.prev_rex = rex_sv; /* inner */
5244 SETREX(rex_sv,cur_eval->u.eval.prev_rex);
5245 rex = (struct regexp *)SvANY(rex_sv);
5246 rexi = RXi_GET(rex);
5247 cur_curlyx = cur_eval->u.eval.prev_curlyx;
5248 ReREFCNT_inc(rex_sv);
5249 st->u.eval.cp = regcppush(0); /* Save *all* the positions. */
5251 /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */
5252 PL_reglastparen = &rex->lastparen;
5253 PL_reglastcloseparen = &rex->lastcloseparen;
5255 REGCP_SET(st->u.eval.lastcp);
5256 PL_reginput = locinput;
5258 /* Restore parens of the outer rex without popping the
5260 tmpix = PL_savestack_ix;
5261 PL_savestack_ix = cur_eval->u.eval.lastcp;
5263 PL_savestack_ix = tmpix;
5265 st->u.eval.prev_eval = cur_eval;
5266 cur_eval = cur_eval->u.eval.prev_eval;
5268 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
5269 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
5270 if ( nochange_depth )
5273 PUSH_YES_STATE_GOTO(EVAL_AB,
5274 st->u.eval.prev_eval->u.eval.B); /* match B */
5277 if (locinput < reginfo->till) {
5278 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5279 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
5281 (long)(locinput - PL_reg_starttry),
5282 (long)(reginfo->till - PL_reg_starttry),
5285 sayNO_SILENT; /* Cannot match: too short. */
5287 PL_reginput = locinput; /* put where regtry can find it */
5288 sayYES; /* Success! */
5290 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
5292 PerlIO_printf(Perl_debug_log,
5293 "%*s %ssubpattern success...%s\n",
5294 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
5295 PL_reginput = locinput; /* put where regtry can find it */
5296 sayYES; /* Success! */
5299 #define ST st->u.ifmatch
5301 case SUSPEND: /* (?>A) */
5303 PL_reginput = locinput;
5306 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
5308 goto ifmatch_trivial_fail_test;
5310 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
5312 ifmatch_trivial_fail_test:
5314 char * const s = HOPBACKc(locinput, scan->flags);
5319 sw = 1 - cBOOL(ST.wanted);
5323 next = scan + ARG(scan);
5331 PL_reginput = locinput;
5335 ST.logical = logical;
5336 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
5338 /* execute body of (?...A) */
5339 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)));
5342 case IFMATCH_A_fail: /* body of (?...A) failed */
5343 ST.wanted = !ST.wanted;
5346 case IFMATCH_A: /* body of (?...A) succeeded */
5348 sw = cBOOL(ST.wanted);
5350 else if (!ST.wanted)
5353 if (OP(ST.me) == SUSPEND)
5354 locinput = PL_reginput;
5356 locinput = PL_reginput = st->locinput;
5357 nextchr = UCHARAT(locinput);
5359 scan = ST.me + ARG(ST.me);
5362 continue; /* execute B */
5367 next = scan + ARG(scan);
5372 reginfo->cutpoint = PL_regeol;
5375 PL_reginput = locinput;
5377 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5378 PUSH_STATE_GOTO(COMMIT_next,next);
5380 case COMMIT_next_fail:
5387 #define ST st->u.mark
5389 ST.prev_mark = mark_state;
5390 ST.mark_name = sv_commit = sv_yes_mark
5391 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5393 ST.mark_loc = PL_reginput = locinput;
5394 PUSH_YES_STATE_GOTO(MARKPOINT_next,next);
5396 case MARKPOINT_next:
5397 mark_state = ST.prev_mark;
5400 case MARKPOINT_next_fail:
5401 if (popmark && sv_eq(ST.mark_name,popmark))
5403 if (ST.mark_loc > startpoint)
5404 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5405 popmark = NULL; /* we found our mark */
5406 sv_commit = ST.mark_name;
5409 PerlIO_printf(Perl_debug_log,
5410 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
5411 REPORT_CODE_OFF+depth*2, "",
5412 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
5415 mark_state = ST.prev_mark;
5416 sv_yes_mark = mark_state ?
5417 mark_state->u.mark.mark_name : NULL;
5421 PL_reginput = locinput;
5423 /* (*SKIP) : if we fail we cut here*/
5424 ST.mark_name = NULL;
5425 ST.mark_loc = locinput;
5426 PUSH_STATE_GOTO(SKIP_next,next);
5428 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
5429 otherwise do nothing. Meaning we need to scan
5431 regmatch_state *cur = mark_state;
5432 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
5435 if ( sv_eq( cur->u.mark.mark_name,
5438 ST.mark_name = find;
5439 PUSH_STATE_GOTO( SKIP_next, next );
5441 cur = cur->u.mark.prev_mark;
5444 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
5446 case SKIP_next_fail:
5448 /* (*CUT:NAME) - Set up to search for the name as we
5449 collapse the stack*/
5450 popmark = ST.mark_name;
5452 /* (*CUT) - No name, we cut here.*/
5453 if (ST.mark_loc > startpoint)
5454 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
5455 /* but we set sv_commit to latest mark_name if there
5456 is one so they can test to see how things lead to this
5459 sv_commit=mark_state->u.mark.mark_name;
5467 if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) {
5469 } else if ( 0xDF == n && !utf8_target && !UTF_PATTERN ) {
5472 U8 folded[UTF8_MAXBYTES_CASE+1];
5474 const char * const l = locinput;
5475 char *e = PL_regeol;
5476 to_uni_fold(n, folded, &foldlen);
5478 if (! foldEQ_utf8((const char*) folded, 0, foldlen, 1,
5479 l, &e, 0, utf8_target)) {
5484 nextchr = UCHARAT(locinput);
5487 if ((n=is_LNBREAK(locinput,utf8_target))) {
5489 nextchr = UCHARAT(locinput);
5494 #define CASE_CLASS(nAmE) \
5496 if ((n=is_##nAmE(locinput,utf8_target))) { \
5498 nextchr = UCHARAT(locinput); \
5503 if ((n=is_##nAmE(locinput,utf8_target))) { \
5506 locinput += UTF8SKIP(locinput); \
5507 nextchr = UCHARAT(locinput); \
5512 CASE_CLASS(HORIZWS);
5516 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
5517 PTR2UV(scan), OP(scan));
5518 Perl_croak(aTHX_ "regexp memory corruption");
5522 /* switch break jumps here */
5523 scan = next; /* prepare to execute the next op and ... */
5524 continue; /* ... jump back to the top, reusing st */
5528 /* push a state that backtracks on success */
5529 st->u.yes.prev_yes_state = yes_state;
5533 /* push a new regex state, then continue at scan */
5535 regmatch_state *newst;
5538 regmatch_state *cur = st;
5539 regmatch_state *curyes = yes_state;
5541 regmatch_slab *slab = PL_regmatch_slab;
5542 for (;curd > -1;cur--,curd--) {
5543 if (cur < SLAB_FIRST(slab)) {
5545 cur = SLAB_LAST(slab);
5547 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
5548 REPORT_CODE_OFF + 2 + depth * 2,"",
5549 curd, PL_reg_name[cur->resume_state],
5550 (curyes == cur) ? "yes" : ""
5553 curyes = cur->u.yes.prev_yes_state;
5556 DEBUG_STATE_pp("push")
5559 st->locinput = locinput;
5561 if (newst > SLAB_LAST(PL_regmatch_slab))
5562 newst = S_push_slab(aTHX);
5563 PL_regmatch_state = newst;
5565 locinput = PL_reginput;
5566 nextchr = UCHARAT(locinput);
5574 * We get here only if there's trouble -- normally "case END" is
5575 * the terminating point.
5577 Perl_croak(aTHX_ "corrupted regexp pointers");
5583 /* we have successfully completed a subexpression, but we must now
5584 * pop to the state marked by yes_state and continue from there */
5585 assert(st != yes_state);
5587 while (st != yes_state) {
5589 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5590 PL_regmatch_slab = PL_regmatch_slab->prev;
5591 st = SLAB_LAST(PL_regmatch_slab);
5595 DEBUG_STATE_pp("pop (no final)");
5597 DEBUG_STATE_pp("pop (yes)");
5603 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
5604 || yes_state > SLAB_LAST(PL_regmatch_slab))
5606 /* not in this slab, pop slab */
5607 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
5608 PL_regmatch_slab = PL_regmatch_slab->prev;
5609 st = SLAB_LAST(PL_regmatch_slab);
5611 depth -= (st - yes_state);
5614 yes_state = st->u.yes.prev_yes_state;
5615 PL_regmatch_state = st;
5618 locinput= st->locinput;
5619 nextchr = UCHARAT(locinput);
5621 state_num = st->resume_state + no_final;
5622 goto reenter_switch;
5625 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
5626 PL_colors[4], PL_colors[5]));
5628 if (PL_reg_eval_set) {
5629 /* each successfully executed (?{...}) block does the equivalent of
5630 * local $^R = do {...}
5631 * When popping the save stack, all these locals would be undone;
5632 * bypass this by setting the outermost saved $^R to the latest
5634 if (oreplsv != GvSV(PL_replgv))
5635 sv_setsv(oreplsv, GvSV(PL_replgv));
5642 PerlIO_printf(Perl_debug_log,
5643 "%*s %sfailed...%s\n",
5644 REPORT_CODE_OFF+depth*2, "",
5645 PL_colors[4], PL_colors[5])
5657 /* there's a previous state to backtrack to */
5659 if (st < SLAB_FIRST(PL_regmatch_slab)) {
5660 PL_regmatch_slab = PL_regmatch_slab->prev;
5661 st = SLAB_LAST(PL_regmatch_slab);
5663 PL_regmatch_state = st;
5664 locinput= st->locinput;
5665 nextchr = UCHARAT(locinput);
5667 DEBUG_STATE_pp("pop");
5669 if (yes_state == st)
5670 yes_state = st->u.yes.prev_yes_state;
5672 state_num = st->resume_state + 1; /* failure = success + 1 */
5673 goto reenter_switch;
5678 if (rex->intflags & PREGf_VERBARG_SEEN) {
5679 SV *sv_err = get_sv("REGERROR", 1);
5680 SV *sv_mrk = get_sv("REGMARK", 1);
5682 sv_commit = &PL_sv_no;
5684 sv_yes_mark = &PL_sv_yes;
5687 sv_commit = &PL_sv_yes;
5688 sv_yes_mark = &PL_sv_no;
5690 sv_setsv(sv_err, sv_commit);
5691 sv_setsv(sv_mrk, sv_yes_mark);
5694 /* clean up; in particular, free all slabs above current one */
5695 LEAVE_SCOPE(oldsave);
5701 - regrepeat - repeatedly match something simple, report how many
5704 * [This routine now assumes that it will only match on things of length 1.
5705 * That was true before, but now we assume scan - reginput is the count,
5706 * rather than incrementing count on every character. [Er, except utf8.]]
5709 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth)
5712 register char *scan;
5714 register char *loceol = PL_regeol;
5715 register I32 hardcount = 0;
5716 register bool utf8_target = PL_reg_match_utf8;
5718 PERL_UNUSED_ARG(depth);
5721 PERL_ARGS_ASSERT_REGREPEAT;
5724 if (max == REG_INFTY)
5726 else if (max < loceol - scan)
5727 loceol = scan + max;
5732 while (scan < loceol && hardcount < max && *scan != '\n') {
5733 scan += UTF8SKIP(scan);
5737 while (scan < loceol && *scan != '\n')
5744 while (scan < loceol && hardcount < max) {
5745 scan += UTF8SKIP(scan);
5755 case EXACT: /* length of string is 1 */
5757 while (scan < loceol && UCHARAT(scan) == c)
5760 case EXACTF: /* length of string is 1 */
5762 while (scan < loceol &&
5763 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c]))
5766 case EXACTFL: /* length of string is 1 */
5767 PL_reg_flags |= RF_tainted;
5769 while (scan < loceol &&
5770 (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c]))
5776 while (hardcount < max && scan < loceol &&
5777 reginclass(prog, p, (U8*)scan, 0, utf8_target)) {
5778 scan += UTF8SKIP(scan);
5782 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
5789 LOAD_UTF8_CHARCLASS_ALNUM();
5790 while (hardcount < max && scan < loceol &&
5791 swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5793 scan += UTF8SKIP(scan);
5796 } else if (FLAGS(p) & USE_UNI) {
5797 while (scan < loceol && isWORDCHAR_L1((U8) *scan)) {
5801 while (scan < loceol && isALNUM((U8) *scan)) {
5807 PL_reg_flags |= RF_tainted;
5810 while (hardcount < max && scan < loceol &&
5811 isALNUM_LC_utf8((U8*)scan)) {
5812 scan += UTF8SKIP(scan);
5816 while (scan < loceol && isALNUM_LC(*scan))
5823 LOAD_UTF8_CHARCLASS_ALNUM();
5824 while (hardcount < max && scan < loceol &&
5825 !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target))
5827 scan += UTF8SKIP(scan);
5830 } else if (FLAGS(p) & USE_UNI) {
5831 while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) {
5835 while (scan < loceol && ! isALNUM((U8) *scan)) {
5841 PL_reg_flags |= RF_tainted;
5844 while (hardcount < max && scan < loceol &&
5845 !isALNUM_LC_utf8((U8*)scan)) {
5846 scan += UTF8SKIP(scan);
5850 while (scan < loceol && !isALNUM_LC(*scan))
5857 LOAD_UTF8_CHARCLASS_SPACE();
5858 while (hardcount < max && scan < loceol &&
5860 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
5862 scan += UTF8SKIP(scan);
5865 } else if (FLAGS(p) & USE_UNI) {
5866 while (scan < loceol && isSPACE_L1((U8) *scan)) {
5870 while (scan < loceol && isSPACE((U8) *scan))
5875 PL_reg_flags |= RF_tainted;
5878 while (hardcount < max && scan < loceol &&
5879 (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5880 scan += UTF8SKIP(scan);
5884 while (scan < loceol && isSPACE_LC(*scan))
5891 LOAD_UTF8_CHARCLASS_SPACE();
5892 while (hardcount < max && scan < loceol &&
5894 swash_fetch(PL_utf8_space,(U8*)scan, utf8_target)))
5896 scan += UTF8SKIP(scan);
5899 } else if (FLAGS(p) & USE_UNI) {
5900 while (scan < loceol && ! isSPACE_L1((U8) *scan)) {
5904 while (scan < loceol && ! isSPACE((U8) *scan)) {
5910 PL_reg_flags |= RF_tainted;
5913 while (hardcount < max && scan < loceol &&
5914 !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
5915 scan += UTF8SKIP(scan);
5919 while (scan < loceol && !isSPACE_LC(*scan))
5926 LOAD_UTF8_CHARCLASS_DIGIT();
5927 while (hardcount < max && scan < loceol &&
5928 swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
5929 scan += UTF8SKIP(scan);
5933 while (scan < loceol && isDIGIT(*scan))
5940 LOAD_UTF8_CHARCLASS_DIGIT();
5941 while (hardcount < max && scan < loceol &&
5942 !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) {
5943 scan += UTF8SKIP(scan);
5947 while (scan < loceol && !isDIGIT(*scan))
5953 while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) {
5959 LNBREAK can match two latin chars, which is ok,
5960 because we have a null terminated string, but we
5961 have to use hardcount in this situation
5963 while (scan < loceol && (c=is_LNBREAK_latin1(scan))) {
5972 while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) {
5977 while (scan < loceol && is_HORIZWS_latin1(scan))
5984 while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) {
5985 scan += UTF8SKIP(scan);
5989 while (scan < loceol && !is_HORIZWS_latin1(scan))
5997 while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) {
6002 while (scan < loceol && is_VERTWS_latin1(scan))
6010 while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) {
6011 scan += UTF8SKIP(scan);
6015 while (scan < loceol && !is_VERTWS_latin1(scan))
6021 default: /* Called on something of 0 width. */
6022 break; /* So match right here or not at all. */
6028 c = scan - PL_reginput;
6032 GET_RE_DEBUG_FLAGS_DECL;
6034 SV * const prop = sv_newmortal();
6035 regprop(prog, prop, p);
6036 PerlIO_printf(Perl_debug_log,
6037 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
6038 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
6046 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
6048 - regclass_swash - prepare the utf8 swash
6052 Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
6058 RXi_GET_DECL(prog,progi);
6059 const struct reg_data * const data = prog ? progi->data : NULL;
6061 PERL_ARGS_ASSERT_REGCLASS_SWASH;
6063 if (data && data->count) {
6064 const U32 n = ARG(node);
6066 if (data->what[n] == 's') {
6067 SV * const rv = MUTABLE_SV(data->data[n]);
6068 AV * const av = MUTABLE_AV(SvRV(rv));
6069 SV **const ary = AvARRAY(av);
6072 /* See the end of regcomp.c:S_regclass() for
6073 * documentation of these array elements. */
6076 a = SvROK(ary[1]) ? &ary[1] : NULL;
6077 b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL;
6081 else if (si && doinit) {
6082 sw = swash_init("utf8", "", si, 1, 0);
6083 (void)av_store(av, 1, sw);
6100 - reginclass - determine if a character falls into a character class
6102 The n is the ANYOF regnode, the p is the target string, lenp
6103 is pointer to the maximum length of how far to go in the p
6104 (if the lenp is zero, UTF8SKIP(p) is used),
6105 utf8_target tells whether the target string is in UTF-8.
6110 S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool utf8_target)
6113 const char flags = ANYOF_FLAGS(n);
6119 PERL_ARGS_ASSERT_REGINCLASS;
6121 if (utf8_target && !UTF8_IS_INVARIANT(c)) {
6122 c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len,
6123 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
6124 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
6125 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
6126 * UTF8_ALLOW_FFFF */
6127 if (len == (STRLEN)-1)
6128 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
6131 plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c));
6132 if (utf8_target || (flags & ANYOF_UNICODE)) {
6135 if (utf8_target && !ANYOF_RUNTIME(n)) {
6136 if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c))
6139 if (!match && utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256)
6143 SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);
6151 utf8_p = bytes_to_utf8(p, &len);
6153 if (swash_fetch(sw, utf8_p, 1))
6155 else if (flags & ANYOF_FOLD) {
6156 if (!match && lenp && av) {
6158 for (i = 0; i <= av_len(av); i++) {
6159 SV* const sv = *av_fetch(av, i, FALSE);
6161 const char * const s = SvPV_const(sv, len);
6162 if (len <= plen && memEQ(s, (char*)utf8_p, len)) {
6170 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
6173 to_utf8_fold(utf8_p, tmpbuf, &tmplen);
6174 if (swash_fetch(sw, tmpbuf, 1))
6179 /* If we allocated a string above, free it */
6180 if (! utf8_target) Safefree(utf8_p);
6183 if (match && lenp && *lenp == 0)
6184 *lenp = UNISKIP(NATIVE_TO_UNI(c));
6186 if (!match && c < 256) {
6187 if (ANYOF_BITMAP_TEST(n, c))
6189 else if (flags & ANYOF_FOLD) {
6192 if (flags & ANYOF_LOCALE) {
6193 PL_reg_flags |= RF_tainted;
6194 f = PL_fold_locale[c];
6198 if (f != c && ANYOF_BITMAP_TEST(n, f))
6202 if (!match && (flags & ANYOF_CLASS)) {
6203 PL_reg_flags |= RF_tainted;
6205 (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) ||
6206 (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) ||
6207 (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) ||
6208 (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) ||
6209 (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) ||
6210 (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) ||
6211 (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) ||
6212 (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) ||
6213 (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) ||
6214 (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) ||
6215 (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) ||
6216 (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) ||
6217 (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) ||
6218 (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) ||
6219 (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) ||
6220 (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) ||
6221 (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) ||
6222 (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) ||
6223 (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) ||
6224 (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) ||
6225 (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) ||
6226 (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) ||
6227 (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) ||
6228 (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) ||
6229 (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) ||
6230 (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) ||
6231 (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) ||
6232 (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) ||
6233 (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) ||
6234 (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c))
6235 ) /* How's that for a conditional? */
6242 return (flags & ANYOF_INVERT) ? !match : match;
6246 S_reghop3(U8 *s, I32 off, const U8* lim)
6250 PERL_ARGS_ASSERT_REGHOP3;
6253 while (off-- && s < lim) {
6254 /* XXX could check well-formedness here */
6259 while (off++ && s > lim) {
6261 if (UTF8_IS_CONTINUED(*s)) {
6262 while (s > lim && UTF8_IS_CONTINUATION(*s))
6265 /* XXX could check well-formedness here */
6272 /* there are a bunch of places where we use two reghop3's that should
6273 be replaced with this routine. but since thats not done yet
6274 we ifdef it out - dmq
6277 S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim)
6281 PERL_ARGS_ASSERT_REGHOP4;
6284 while (off-- && s < rlim) {
6285 /* XXX could check well-formedness here */
6290 while (off++ && s > llim) {
6292 if (UTF8_IS_CONTINUED(*s)) {
6293 while (s > llim && UTF8_IS_CONTINUATION(*s))
6296 /* XXX could check well-formedness here */
6304 S_reghopmaybe3(U8* s, I32 off, const U8* lim)
6308 PERL_ARGS_ASSERT_REGHOPMAYBE3;
6311 while (off-- && s < lim) {
6312 /* XXX could check well-formedness here */
6319 while (off++ && s > lim) {
6321 if (UTF8_IS_CONTINUED(*s)) {
6322 while (s > lim && UTF8_IS_CONTINUATION(*s))
6325 /* XXX could check well-formedness here */
6334 restore_pos(pTHX_ void *arg)
6337 regexp * const rex = (regexp *)arg;
6338 if (PL_reg_eval_set) {
6339 if (PL_reg_oldsaved) {
6340 rex->subbeg = PL_reg_oldsaved;
6341 rex->sublen = PL_reg_oldsavedlen;
6342 #ifdef PERL_OLD_COPY_ON_WRITE
6343 rex->saved_copy = PL_nrs;
6345 RXp_MATCH_COPIED_on(rex);
6347 PL_reg_magic->mg_len = PL_reg_oldpos;
6348 PL_reg_eval_set = 0;
6349 PL_curpm = PL_reg_oldcurpm;
6354 S_to_utf8_substr(pTHX_ register regexp *prog)
6358 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
6361 if (prog->substrs->data[i].substr
6362 && !prog->substrs->data[i].utf8_substr) {
6363 SV* const sv = newSVsv(prog->substrs->data[i].substr);
6364 prog->substrs->data[i].utf8_substr = sv;
6365 sv_utf8_upgrade(sv);
6366 if (SvVALID(prog->substrs->data[i].substr)) {
6367 const U8 flags = BmFLAGS(prog->substrs->data[i].substr);
6368 if (flags & FBMcf_TAIL) {
6369 /* Trim the trailing \n that fbm_compile added last
6371 SvCUR_set(sv, SvCUR(sv) - 1);
6372 /* Whilst this makes the SV technically "invalid" (as its
6373 buffer is no longer followed by "\0") when fbm_compile()
6374 adds the "\n" back, a "\0" is restored. */
6376 fbm_compile(sv, flags);
6378 if (prog->substrs->data[i].substr == prog->check_substr)
6379 prog->check_utf8 = sv;
6385 S_to_byte_substr(pTHX_ register regexp *prog)
6390 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
6393 if (prog->substrs->data[i].utf8_substr
6394 && !prog->substrs->data[i].substr) {
6395 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
6396 if (sv_utf8_downgrade(sv, TRUE)) {
6397 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
6399 = BmFLAGS(prog->substrs->data[i].utf8_substr);
6400 if (flags & FBMcf_TAIL) {
6401 /* Trim the trailing \n that fbm_compile added last
6403 SvCUR_set(sv, SvCUR(sv) - 1);
6405 fbm_compile(sv, flags);
6411 prog->substrs->data[i].substr = sv;
6412 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
6413 prog->check_substr = sv;
6420 * c-indentation-style: bsd
6422 * indent-tabs-mode: t
6425 * ex: set ts=8 sts=4 sw=4 noet: