5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
41 * pregcomp and pregexec -- regsub and regerror are not used in perl
43 * Copyright (c) 1986 by University of Toronto.
44 * Written by Henry Spencer. Not derived from licensed software.
46 * Permission is granted to anyone to use this software for any
47 * purpose on any computer system, and to redistribute it freely,
48 * subject to the following restrictions:
50 * 1. The author is not responsible for the consequences of use of
51 * this software, no matter how awful, even if they arise
54 * 2. The origin of this software must not be misrepresented, either
55 * by explicit claim or by omission.
57 * 3. Altered versions must be plainly marked as such, and must not
58 * be misrepresented as being the original software.
60 **** Alterations to Henry's code are...
62 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
63 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
64 **** by Larry Wall and others
66 **** You may distribute under the terms of either the GNU General Public
67 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGEXEC_C
75 #undef PERL_IN_XSUB_RE
76 #define PERL_IN_XSUB_RE 1
79 #undef PERL_IN_XSUB_RE
81 #ifdef PERL_IN_XSUB_RE
87 #include "inline_invlist.c"
88 #include "unicode_constants.h"
91 /* At least one required character in the target string is expressible only in
93 static const char* const non_utf8_target_but_utf8_required
94 = "Can't match, because target string needs to be in UTF-8\n";
97 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
98 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
102 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
105 #define STATIC static
108 /* Valid only for non-utf8 strings: avoids the reginclass
109 * call if there are no complications: i.e., if everything matchable is
110 * straight forward in the bitmap */
111 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
112 : ANYOF_BITMAP_TEST(p,*(c)))
118 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
119 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
121 #define HOPc(pos,off) \
122 (char *)(reginfo->is_utf8_target \
123 ? reghop3((U8*)pos, off, \
124 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
127 #define HOPBACKc(pos, off) \
128 (char*)(reginfo->is_utf8_target \
129 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
130 : (pos - off >= reginfo->strbeg) \
134 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
135 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
137 /* lim must be +ve. Returns NULL on overshoot */
138 #define HOPMAYBE3(pos,off,lim) \
139 (reginfo->is_utf8_target \
140 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
141 : ((U8*)pos + off <= lim) \
145 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
146 * off must be >=0; args should be vars rather than expressions */
147 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
148 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
149 : (U8*)((pos + off) > lim ? lim : (pos + off)))
151 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
152 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
154 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
156 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
157 #define NEXTCHR_IS_EOS (nextchr < 0)
159 #define SET_nextchr \
160 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
162 #define SET_locinput(p) \
167 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
169 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
170 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
171 1, 0, invlist, &flags); \
176 /* If in debug mode, we test that a known character properly matches */
178 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
181 utf8_char_in_property) \
182 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
183 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
185 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
188 utf8_char_in_property) \
189 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
192 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
193 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
195 PL_XPosix_ptrs[_CC_WORDCHAR], \
196 LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
198 #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \
200 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \
201 "_X_regular_begin", \
203 LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \
204 LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \
207 COMBINING_GRAVE_ACCENT_UTF8); \
210 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
211 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
213 /* for use after a quantifier and before an EXACT-like node -- japhy */
214 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
216 * NOTE that *nothing* that affects backtracking should be in here, specifically
217 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
218 * node that is in between two EXACT like nodes when ascertaining what the required
219 * "follow" character is. This should probably be moved to regex compile time
220 * although it may be done at run time beause of the REF possibility - more
221 * investigation required. -- demerphq
223 #define JUMPABLE(rn) ( \
225 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
227 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
228 OP(rn) == PLUS || OP(rn) == MINMOD || \
230 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
232 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
234 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
237 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
238 we don't need this definition. */
239 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
240 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
241 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
244 /* ... so we use this as its faster. */
245 #define IS_TEXT(rn) ( OP(rn)==EXACT )
246 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
247 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
248 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
253 Search for mandatory following text node; for lookahead, the text must
254 follow but for lookbehind (rn->flags != 0) we skip to the next step.
256 #define FIND_NEXT_IMPT(rn) STMT_START { \
257 while (JUMPABLE(rn)) { \
258 const OPCODE type = OP(rn); \
259 if (type == SUSPEND || PL_regkind[type] == CURLY) \
260 rn = NEXTOPER(NEXTOPER(rn)); \
261 else if (type == PLUS) \
263 else if (type == IFMATCH) \
264 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
265 else rn += NEXT_OFF(rn); \
269 /* These constants are for finding GCB=LV and GCB=LVT in the CLUMP regnode.
270 * These are for the pre-composed Hangul syllables, which are all in a
271 * contiguous block and arranged there in such a way so as to facilitate
272 * alorithmic determination of their characteristics. As such, they don't need
273 * a swash, but can be determined by simple arithmetic. Almost all are
274 * GCB=LVT, but every 28th one is a GCB=LV */
275 #define SBASE 0xAC00 /* Start of block */
276 #define SCount 11172 /* Length of block */
279 #define SLAB_FIRST(s) (&(s)->states[0])
280 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
282 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
283 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
284 static regmatch_state * S_push_slab(pTHX);
286 #define REGCP_PAREN_ELEMS 3
287 #define REGCP_OTHER_ELEMS 3
288 #define REGCP_FRAME_ELEMS 1
289 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
290 * are needed for the regexp context stack bookkeeping. */
293 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
296 const int retval = PL_savestack_ix;
297 const int paren_elems_to_push =
298 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
299 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
300 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
302 GET_RE_DEBUG_FLAGS_DECL;
304 PERL_ARGS_ASSERT_REGCPPUSH;
306 if (paren_elems_to_push < 0)
307 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %i",
308 paren_elems_to_push, maxopenparen, parenfloor, REGCP_PAREN_ELEMS);
310 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
311 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
312 " out of range (%lu-%ld)",
314 (unsigned long)maxopenparen,
317 SSGROW(total_elems + REGCP_FRAME_ELEMS);
320 if ((int)maxopenparen > (int)parenfloor)
321 PerlIO_printf(Perl_debug_log,
322 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
327 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
328 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
329 SSPUSHIV(rex->offs[p].end);
330 SSPUSHIV(rex->offs[p].start);
331 SSPUSHINT(rex->offs[p].start_tmp);
332 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
333 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
335 (IV)rex->offs[p].start,
336 (IV)rex->offs[p].start_tmp,
340 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
341 SSPUSHINT(maxopenparen);
342 SSPUSHINT(rex->lastparen);
343 SSPUSHINT(rex->lastcloseparen);
344 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
349 /* These are needed since we do not localize EVAL nodes: */
350 #define REGCP_SET(cp) \
352 PerlIO_printf(Perl_debug_log, \
353 " Setting an EVAL scope, savestack=%"IVdf"\n", \
354 (IV)PL_savestack_ix)); \
357 #define REGCP_UNWIND(cp) \
359 if (cp != PL_savestack_ix) \
360 PerlIO_printf(Perl_debug_log, \
361 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
362 (IV)(cp), (IV)PL_savestack_ix)); \
365 #define UNWIND_PAREN(lp, lcp) \
366 for (n = rex->lastparen; n > lp; n--) \
367 rex->offs[n].end = -1; \
368 rex->lastparen = n; \
369 rex->lastcloseparen = lcp;
373 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
378 GET_RE_DEBUG_FLAGS_DECL;
380 PERL_ARGS_ASSERT_REGCPPOP;
382 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
384 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
385 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
386 rex->lastcloseparen = SSPOPINT;
387 rex->lastparen = SSPOPINT;
388 *maxopenparen_p = SSPOPINT;
390 i -= REGCP_OTHER_ELEMS;
391 /* Now restore the parentheses context. */
393 if (i || rex->lastparen + 1 <= rex->nparens)
394 PerlIO_printf(Perl_debug_log,
395 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
400 paren = *maxopenparen_p;
401 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
403 rex->offs[paren].start_tmp = SSPOPINT;
404 rex->offs[paren].start = SSPOPIV;
406 if (paren <= rex->lastparen)
407 rex->offs[paren].end = tmps;
408 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
409 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
411 (IV)rex->offs[paren].start,
412 (IV)rex->offs[paren].start_tmp,
413 (IV)rex->offs[paren].end,
414 (paren > rex->lastparen ? "(skipped)" : ""));
419 /* It would seem that the similar code in regtry()
420 * already takes care of this, and in fact it is in
421 * a better location to since this code can #if 0-ed out
422 * but the code in regtry() is needed or otherwise tests
423 * requiring null fields (pat.t#187 and split.t#{13,14}
424 * (as of patchlevel 7877) will fail. Then again,
425 * this code seems to be necessary or otherwise
426 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
427 * --jhi updated by dapm */
428 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
429 if (i > *maxopenparen_p)
430 rex->offs[i].start = -1;
431 rex->offs[i].end = -1;
432 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
433 " \\%"UVuf": %s ..-1 undeffing\n",
435 (i > *maxopenparen_p) ? "-1" : " "
441 /* restore the parens and associated vars at savestack position ix,
442 * but without popping the stack */
445 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
447 I32 tmpix = PL_savestack_ix;
448 PL_savestack_ix = ix;
449 regcppop(rex, maxopenparen_p);
450 PL_savestack_ix = tmpix;
453 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
456 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
458 /* Returns a boolean as to whether or not 'character' is a member of the
459 * Posix character class given by 'classnum' that should be equivalent to a
460 * value in the typedef '_char_class_number'.
462 * Ideally this could be replaced by a just an array of function pointers
463 * to the C library functions that implement the macros this calls.
464 * However, to compile, the precise function signatures are required, and
465 * these may vary from platform to to platform. To avoid having to figure
466 * out what those all are on each platform, I (khw) am using this method,
467 * which adds an extra layer of function call overhead (unless the C
468 * optimizer strips it away). But we don't particularly care about
469 * performance with locales anyway. */
471 switch ((_char_class_number) classnum) {
472 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
473 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
474 case _CC_ENUM_ASCII: return isASCII_LC(character);
475 case _CC_ENUM_BLANK: return isBLANK_LC(character);
476 case _CC_ENUM_CASED: return isLOWER_LC(character)
477 || isUPPER_LC(character);
478 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
479 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
480 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
481 case _CC_ENUM_LOWER: return isLOWER_LC(character);
482 case _CC_ENUM_PRINT: return isPRINT_LC(character);
483 case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
484 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
485 case _CC_ENUM_SPACE: return isSPACE_LC(character);
486 case _CC_ENUM_UPPER: return isUPPER_LC(character);
487 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
488 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
489 default: /* VERTSPACE should never occur in locales */
490 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
493 assert(0); /* NOTREACHED */
498 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
500 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
501 * 'character' is a member of the Posix character class given by 'classnum'
502 * that should be equivalent to a value in the typedef
503 * '_char_class_number'.
505 * This just calls isFOO_lc on the code point for the character if it is in
506 * the range 0-255. Outside that range, all characters avoid Unicode
507 * rules, ignoring any locale. So use the Unicode function if this class
508 * requires a swash, and use the Unicode macro otherwise. */
510 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
512 if (UTF8_IS_INVARIANT(*character)) {
513 return isFOO_lc(classnum, *character);
515 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
516 return isFOO_lc(classnum,
517 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
520 if (classnum < _FIRST_NON_SWASH_CC) {
522 /* Initialize the swash unless done already */
523 if (! PL_utf8_swash_ptrs[classnum]) {
524 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
525 PL_utf8_swash_ptrs[classnum] =
526 _core_swash_init("utf8",
529 PL_XPosix_ptrs[classnum], &flags);
532 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
534 TRUE /* is UTF */ ));
537 switch ((_char_class_number) classnum) {
539 case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
541 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
542 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
543 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
544 default: return 0; /* Things like CNTRL are always
548 assert(0); /* NOTREACHED */
553 * pregexec and friends
556 #ifndef PERL_IN_XSUB_RE
558 - pregexec - match a regexp against a string
561 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
562 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
563 /* stringarg: the point in the string at which to begin matching */
564 /* strend: pointer to null at end of string */
565 /* strbeg: real beginning of string */
566 /* minend: end of match must be >= minend bytes after stringarg. */
567 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
568 * itself is accessed via the pointers above */
569 /* nosave: For optimizations. */
571 PERL_ARGS_ASSERT_PREGEXEC;
574 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
575 nosave ? 0 : REXEC_COPY_STR);
581 /* re_intuit_start():
583 * Based on some optimiser hints, try to find the earliest position in the
584 * string where the regex could match.
586 * rx: the regex to match against
587 * sv: the SV being matched: only used for utf8 flag; the string
588 * itself is accessed via the pointers below. Note that on
589 * something like an overloaded SV, SvPOK(sv) may be false
590 * and the string pointers may point to something unrelated to
592 * strbeg: real beginning of string
593 * strpos: the point in the string at which to begin matching
594 * strend: pointer to the byte following the last char of the string
595 * flags currently unused; set to 0
596 * data: currently unused; set to NULL
598 * The basic idea of re_intuit_start() is to use some known information
599 * about the pattern, namely:
601 * a) the longest known anchored substring (i.e. one that's at a
602 * constant offset from the beginning of the pattern; but not
603 * necessarily at a fixed offset from the beginning of the
605 * b) the longest floating substring (i.e. one that's not at a constant
606 * offset from the beginning of the pattern);
607 * c) Whether the pattern is anchored to the string; either
608 * an absolute anchor: /^../, or anchored to \n: /^.../m,
609 * or anchored to pos(): /\G/;
610 * d) A start class: a real or synthetic character class which
611 * represents which characters are legal at the start of the pattern;
613 * to either quickly reject the match, or to find the earliest position
614 * within the string at which the pattern might match, thus avoiding
615 * running the full NFA engine at those earlier locations, only to
616 * eventually fail and retry further along.
618 * Returns NULL if the pattern can't match, or returns the address within
619 * the string which is the earliest place the match could occur.
621 * The longest of the anchored and floating substrings is called 'check'
622 * and is checked first. The other is called 'other' and is checked
623 * second. The 'other' substring may not be present. For example,
625 * /(abc|xyz)ABC\d{0,3}DEFG/
629 * check substr (float) = "DEFG", offset 6..9 chars
630 * other substr (anchored) = "ABC", offset 3..3 chars
633 * Be aware that during the course of this function, sometimes 'anchored'
634 * refers to a substring being anchored relative to the start of the
635 * pattern, and sometimes to the pattern itself being anchored relative to
636 * the string. For example:
638 * /\dabc/: "abc" is anchored to the pattern;
639 * /^\dabc/: "abc" is anchored to the pattern and the string;
640 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
641 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
642 * but the pattern is anchored to the string.
646 Perl_re_intuit_start(pTHX_
649 const char * const strbeg,
653 re_scream_pos_data *data)
656 struct regexp *const prog = ReANY(rx);
657 SSize_t start_shift = prog->check_offset_min;
658 /* Should be nonnegative! */
659 SSize_t end_shift = 0;
660 /* current lowest pos in string where the regex can start matching */
661 char *rx_origin = strpos;
663 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
664 U8 other_ix = 1 - prog->substrs->check_ix;
666 char *other_last = strpos;/* latest pos 'other' substr already checked to */
667 char *check_at = NULL; /* check substr found at this pos */
668 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
669 RXi_GET_DECL(prog,progi);
670 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
671 regmatch_info *const reginfo = ®info_buf;
672 GET_RE_DEBUG_FLAGS_DECL;
674 PERL_ARGS_ASSERT_RE_INTUIT_START;
675 PERL_UNUSED_ARG(flags);
676 PERL_UNUSED_ARG(data);
678 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
679 "Intuit: trying to determine minimum start position...\n"));
681 /* for now, assume that all substr offsets are positive. If at some point
682 * in the future someone wants to do clever things with look-behind and
683 * -ve offsets, they'll need to fix up any code in this function
684 * which uses these offsets. See the thread beginning
685 * <20140113145929.GF27210@iabyn.com>
687 assert(prog->substrs->data[0].min_offset >= 0);
688 assert(prog->substrs->data[0].max_offset >= 0);
689 assert(prog->substrs->data[1].min_offset >= 0);
690 assert(prog->substrs->data[1].max_offset >= 0);
691 assert(prog->substrs->data[2].min_offset >= 0);
692 assert(prog->substrs->data[2].max_offset >= 0);
694 /* for now, assume that if both present, that the floating substring
695 * doesn't start before the anchored substring.
696 * If you break this assumption (e.g. doing better optimisations
697 * with lookahead/behind), then you'll need to audit the code in this
698 * function carefully first
701 ! ( (prog->anchored_utf8 || prog->anchored_substr)
702 && (prog->float_utf8 || prog->float_substr))
703 || (prog->float_min_offset >= prog->anchored_offset));
705 /* byte rather than char calculation for efficiency. It fails
706 * to quickly reject some cases that can't match, but will reject
707 * them later after doing full char arithmetic */
708 if (prog->minlen > strend - strpos) {
709 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
710 " String too short...\n"));
714 reginfo->is_utf8_target = cBOOL(utf8_target);
715 reginfo->info_aux = NULL;
716 reginfo->strbeg = strbeg;
717 reginfo->strend = strend;
718 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
720 /* not actually used within intuit, but zero for safety anyway */
721 reginfo->poscache_maxiter = 0;
724 if (!prog->check_utf8 && prog->check_substr)
725 to_utf8_substr(prog);
726 check = prog->check_utf8;
728 if (!prog->check_substr && prog->check_utf8) {
729 if (! to_byte_substr(prog)) {
730 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
733 check = prog->check_substr;
736 /* dump the various substring data */
737 DEBUG_OPTIMISE_MORE_r({
739 for (i=0; i<=2; i++) {
740 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
741 : prog->substrs->data[i].substr);
745 PerlIO_printf(Perl_debug_log,
746 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
747 " useful=%"IVdf" utf8=%d [%s]\n",
749 (IV)prog->substrs->data[i].min_offset,
750 (IV)prog->substrs->data[i].max_offset,
751 (IV)prog->substrs->data[i].end_shift,
758 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
760 /* ml_anch: check after \n?
762 * A note about IMPLICIT: on an un-anchored pattern beginning
763 * with /.*.../, these flags will have been added by the
765 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
766 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
768 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
769 && !(prog->intflags & PREGf_IMPLICIT);
771 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
772 /* we are only allowed to match at BOS or \G */
774 /* trivially reject if there's a BOS anchor and we're not at BOS.
776 * Note that we don't try to do a similar quick reject for
777 * \G, since generally the caller will have calculated strpos
778 * based on pos() and gofs, so the string is already correctly
779 * anchored by definition; and handling the exceptions would
780 * be too fiddly (e.g. REXEC_IGNOREPOS).
782 if ( strpos != strbeg
783 && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL)))
785 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
786 " Not at start...\n"));
790 /* in the presence of an anchor, the anchored (relative to the
791 * start of the regex) substr must also be anchored relative
792 * to strpos. So quickly reject if substr isn't found there.
793 * This works for \G too, because the caller will already have
794 * subtracted gofs from pos, and gofs is the offset from the
795 * \G to the start of the regex. For example, in /.abc\Gdef/,
796 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
797 * caller will have set strpos=pos()-4; we look for the substr
798 * at position pos()-4+1, which lines up with the "a" */
800 if (prog->check_offset_min == prog->check_offset_max
801 && !(prog->intflags & PREGf_CANY_SEEN))
803 /* Substring at constant offset from beg-of-str... */
804 SSize_t slen = SvCUR(check);
805 char *s = HOP3c(strpos, prog->check_offset_min, strend);
807 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
808 " Looking for check substr at fixed offset %"IVdf"...\n",
809 (IV)prog->check_offset_min));
812 /* In this case, the regex is anchored at the end too.
813 * Unless it's a multiline match, the lengths must match
814 * exactly, give or take a \n. NB: slen >= 1 since
815 * the last char of check is \n */
817 && ( strend - s > slen
818 || strend - s < slen - 1
819 || (strend - s == slen && strend[-1] != '\n')))
821 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
822 " String too long...\n"));
825 /* Now should match s[0..slen-2] */
828 if (slen && (*SvPVX_const(check) != *s
829 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
831 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
832 " String not equal...\n"));
837 goto success_at_start;
842 end_shift = prog->check_end_shift;
844 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
846 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
847 (IV)end_shift, RX_PRECOMP(prog));
852 /* This is the (re)entry point of the main loop in this function.
853 * The goal of this loop is to:
854 * 1) find the "check" substring in the region rx_origin..strend
855 * (adjusted by start_shift / end_shift). If not found, reject
857 * 2) If it exists, look for the "other" substr too if defined; for
858 * example, if the check substr maps to the anchored substr, then
859 * check the floating substr, and vice-versa. If not found, go
860 * back to (1) with rx_origin suitably incremented.
861 * 3) If we find an rx_origin position that doesn't contradict
862 * either of the substrings, then check the possible additional
863 * constraints on rx_origin of /^.../m or a known start class.
864 * If these fail, then depending on which constraints fail, jump
865 * back to here, or to various other re-entry points further along
866 * that skip some of the first steps.
867 * 4) If we pass all those tests, update the BmUSEFUL() count on the
868 * substring. If the start position was determined to be at the
869 * beginning of the string - so, not rejected, but not optimised,
870 * since we have to run regmatch from position 0 - decrement the
871 * BmUSEFUL() count. Otherwise increment it.
875 /* first, look for the 'check' substring */
881 DEBUG_OPTIMISE_MORE_r({
882 PerlIO_printf(Perl_debug_log,
883 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
884 " Start shift: %"IVdf" End shift %"IVdf
885 " Real end Shift: %"IVdf"\n",
886 (IV)(rx_origin - strpos),
887 (IV)prog->check_offset_min,
890 (IV)prog->check_end_shift);
893 if (prog->intflags & PREGf_CANY_SEEN) {
894 start_point= (U8*)(rx_origin + start_shift);
895 end_point= (U8*)(strend - end_shift);
896 if (start_point > end_point)
899 end_point = HOP3(strend, -end_shift, strbeg);
900 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
906 /* If the regex is absolutely anchored to either the start of the
907 * string (BOL,SBOL) or to pos() (ANCH_GPOS), then
908 * check_offset_max represents an upper bound on the string where
909 * the substr could start. For the ANCH_GPOS case, we assume that
910 * the caller of intuit will have already set strpos to
911 * pos()-gofs, so in this case strpos + offset_max will still be
912 * an upper bound on the substr.
915 && prog->intflags & PREGf_ANCH
916 && prog->check_offset_max != SSize_t_MAX)
918 SSize_t len = SvCUR(check) - !!SvTAIL(check);
919 const char * const anchor =
920 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
922 /* do a bytes rather than chars comparison. It's conservative;
923 * so it skips doing the HOP if the result can't possibly end
924 * up earlier than the old value of end_point.
926 if ((char*)end_point - anchor > prog->check_offset_max) {
927 end_point = HOP3lim((U8*)anchor,
928 prog->check_offset_max,
934 DEBUG_OPTIMISE_MORE_r({
935 PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n",
936 (int)(end_point - start_point),
937 (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
941 check_at = fbm_instr( start_point, end_point,
942 check, multiline ? FBMrf_MULTILINE : 0);
944 /* Update the count-of-usability, remove useless subpatterns,
948 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
949 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
950 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
951 (check_at ? "Found" : "Did not find"),
952 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
953 ? "anchored" : "floating"),
956 (check_at ? " at offset " : "...\n") );
961 /* Finish the diagnostic message */
962 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
964 /* set rx_origin to the minimum position where the regex could start
965 * matching, given the constraint of the just-matched check substring.
966 * But don't set it lower than previously.
969 if (check_at - rx_origin > prog->check_offset_max)
970 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
974 /* now look for the 'other' substring if defined */
976 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
977 : prog->substrs->data[other_ix].substr)
979 /* Take into account the "other" substring. */
983 struct reg_substr_datum *other;
986 other = &prog->substrs->data[other_ix];
988 /* if "other" is anchored:
989 * we've previously found a floating substr starting at check_at.
990 * This means that the regex origin must lie somewhere
991 * between min (rx_origin): HOP3(check_at, -check_offset_max)
992 * and max: HOP3(check_at, -check_offset_min)
993 * (except that min will be >= strpos)
994 * So the fixed substr must lie somewhere between
995 * HOP3(min, anchored_offset)
996 * HOP3(max, anchored_offset) + SvCUR(substr)
999 /* if "other" is floating
1000 * Calculate last1, the absolute latest point where the
1001 * floating substr could start in the string, ignoring any
1002 * constraints from the earlier fixed match. It is calculated
1005 * strend - prog->minlen (in chars) is the absolute latest
1006 * position within the string where the origin of the regex
1007 * could appear. The latest start point for the floating
1008 * substr is float_min_offset(*) on from the start of the
1009 * regex. last1 simply combines thee two offsets.
1011 * (*) You might think the latest start point should be
1012 * float_max_offset from the regex origin, and technically
1013 * you'd be correct. However, consider
1015 * Here, float min, max are 3,5 and minlen is 7.
1016 * This can match either
1020 * In the first case, the regex matches minlen chars; in the
1021 * second, minlen+1, in the third, minlen+2.
1022 * In the first case, the floating offset is 3 (which equals
1023 * float_min), in the second, 4, and in the third, 5 (which
1024 * equals float_max). In all cases, the floating string bcd
1025 * can never start more than 4 chars from the end of the
1026 * string, which equals minlen - float_min. As the substring
1027 * starts to match more than float_min from the start of the
1028 * regex, it makes the regex match more than minlen chars,
1029 * and the two cancel each other out. So we can always use
1030 * float_min - minlen, rather than float_max - minlen for the
1031 * latest position in the string.
1033 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1034 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1037 assert(prog->minlen >= other->min_offset);
1038 last1 = HOP3c(strend,
1039 other->min_offset - prog->minlen, strbeg);
1041 if (other_ix) {/* i.e. if (other-is-float) */
1042 /* last is the latest point where the floating substr could
1043 * start, *given* any constraints from the earlier fixed
1044 * match. This constraint is that the floating string starts
1045 * <= float_max_offset chars from the regex origin (rx_origin).
1046 * If this value is less than last1, use it instead.
1048 assert(rx_origin <= last1);
1050 /* this condition handles the offset==infinity case, and
1051 * is a short-cut otherwise. Although it's comparing a
1052 * byte offset to a char length, it does so in a safe way,
1053 * since 1 char always occupies 1 or more bytes,
1054 * so if a string range is (last1 - rx_origin) bytes,
1055 * it will be less than or equal to (last1 - rx_origin)
1056 * chars; meaning it errs towards doing the accurate HOP3
1057 * rather than just using last1 as a short-cut */
1058 (last1 - rx_origin) < other->max_offset
1060 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1063 assert(strpos + start_shift <= check_at);
1064 last = HOP4c(check_at, other->min_offset - start_shift,
1068 s = HOP3c(rx_origin, other->min_offset, strend);
1069 if (s < other_last) /* These positions already checked */
1072 must = utf8_target ? other->utf8_substr : other->substr;
1073 assert(SvPOK(must));
1076 (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
1078 multiline ? FBMrf_MULTILINE : 0
1081 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1082 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1083 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1084 s ? "Found" : "Contradicts",
1085 other_ix ? "floating" : "anchored",
1086 quoted, RE_SV_TAIL(must));
1091 /* last1 is latest possible substr location. If we didn't
1092 * find it before there, we never will */
1093 if (last >= last1) {
1094 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1095 ", giving up...\n"));
1099 /* try to find the check substr again at a later
1100 * position. Maybe next time we'll find the "other" substr
1102 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1103 ", trying %s at offset %ld...\n",
1104 (other_ix ? "floating" : "anchored"),
1105 (long)(HOP3c(check_at, 1, strend) - strpos)));
1107 other_last = HOP3c(last, 1, strend) /* highest failure */;
1109 other_ix /* i.e. if other-is-float */
1110 ? HOP3c(rx_origin, 1, strend)
1111 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1115 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
1116 (long)(s - strpos)));
1118 if (other_ix) { /* if (other-is-float) */
1119 /* other_last is set to s, not s+1, since its possible for
1120 * a floating substr to fail first time, then succeed
1121 * second time at the same floating position; e.g.:
1122 * "-AB--AABZ" =~ /\wAB\d*Z/
1123 * The first time round, anchored and float match at
1124 * "-(AB)--AAB(Z)" then fail on the initial \w character
1125 * class. Second time round, they match at "-AB--A(AB)(Z)".
1130 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1131 other_last = HOP3c(s, 1, strend);
1136 DEBUG_OPTIMISE_MORE_r(
1137 PerlIO_printf(Perl_debug_log,
1138 " Check-only match: offset min:%"IVdf" max:%"IVdf
1139 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1140 " strend-strpos:%"IVdf"\n",
1141 (IV)prog->check_offset_min,
1142 (IV)prog->check_offset_max,
1143 (IV)(check_at-strpos),
1144 (IV)(rx_origin-strpos),
1145 (IV)(rx_origin-check_at),
1151 postprocess_substr_matches:
1153 /* handle the extra constraint of /^.../m if present */
1155 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1158 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1159 " looking for /^/m anchor"));
1161 /* we have failed the constraint of a \n before rx_origin.
1162 * Find the next \n, if any, even if it's beyond the current
1163 * anchored and/or floating substrings. Whether we should be
1164 * scanning ahead for the next \n or the next substr is debatable.
1165 * On the one hand you'd expect rare substrings to appear less
1166 * often than \n's. On the other hand, searching for \n means
1167 * we're effectively flipping been check_substr and "\n" on each
1168 * iteration as the current "rarest" string candidate, which
1169 * means for example that we'll quickly reject the whole string if
1170 * hasn't got a \n, rather than trying every substr position
1174 s = HOP3c(strend, - prog->minlen, strpos);
1175 if (s <= rx_origin ||
1176 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1178 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1179 " Did not find /%s^%s/m...\n",
1180 PL_colors[0], PL_colors[1]));
1184 /* earliest possible origin is 1 char after the \n.
1185 * (since *rx_origin == '\n', it's safe to ++ here rather than
1186 * HOP(rx_origin, 1)) */
1189 if (prog->substrs->check_ix == 0 /* check is anchored */
1190 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1192 /* Position contradicts check-string; either because
1193 * check was anchored (and thus has no wiggle room),
1194 * or check was float and rx_origin is above the float range */
1195 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1196 " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
1197 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1201 /* if we get here, the check substr must have been float,
1202 * is in range, and we may or may not have had an anchored
1203 * "other" substr which still contradicts */
1204 assert(prog->substrs->check_ix); /* check is float */
1206 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1207 /* whoops, the anchored "other" substr exists, so we still
1208 * contradict. On the other hand, the float "check" substr
1209 * didn't contradict, so just retry the anchored "other"
1211 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1212 " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
1213 PL_colors[0], PL_colors[1],
1214 (long)(rx_origin - strpos),
1215 (long)(rx_origin - strpos + prog->anchored_offset)));
1216 goto do_other_substr;
1219 /* success: we don't contradict the found floating substring
1220 * (and there's no anchored substr). */
1221 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1222 " Found /%s^%s/m at offset %ld...\n",
1223 PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
1226 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1227 " (multiline anchor test skipped)\n"));
1233 /* if we have a starting character class, then test that extra constraint.
1234 * (trie stclasses are too expensive to use here, we are better off to
1235 * leave it to regmatch itself) */
1237 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1238 const U8* const str = (U8*)STRING(progi->regstclass);
1240 /* XXX this value could be pre-computed */
1241 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1242 ? (reginfo->is_utf8_pat
1243 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1244 : STR_LEN(progi->regstclass))
1248 /* latest pos that a matching float substr constrains rx start to */
1249 char *rx_max_float = NULL;
1251 /* if the current rx_origin is anchored, either by satisfying an
1252 * anchored substring constraint, or a /^.../m constraint, then we
1253 * can reject the current origin if the start class isn't found
1254 * at the current position. If we have a float-only match, then
1255 * rx_origin is constrained to a range; so look for the start class
1256 * in that range. if neither, then look for the start class in the
1257 * whole rest of the string */
1259 /* XXX DAPM it's not clear what the minlen test is for, and why
1260 * it's not used in the floating case. Nothing in the test suite
1261 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1262 * Here are some old comments, which may or may not be correct:
1264 * minlen == 0 is possible if regstclass is \b or \B,
1265 * and the fixed substr is ''$.
1266 * Since minlen is already taken into account, rx_origin+1 is
1267 * before strend; accidentally, minlen >= 1 guaranties no false
1268 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1269 * 0) below assumes that regstclass does not come from lookahead...
1270 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1271 * This leaves EXACTF-ish only, which are dealt with in
1275 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1276 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1277 else if (prog->float_substr || prog->float_utf8) {
1278 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1279 endpos= HOP3c(rx_max_float, cl_l, strend);
1284 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1285 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
1286 " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1287 (IV)start_shift, (IV)(check_at - strbeg),
1288 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1290 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1293 if (endpos == strend) {
1294 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1295 " Could not match STCLASS...\n") );
1298 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1299 " This position contradicts STCLASS...\n") );
1300 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1301 && !(prog->intflags & PREGf_IMPLICIT))
1304 /* Contradict one of substrings */
1305 if (prog->anchored_substr || prog->anchored_utf8) {
1306 if (prog->substrs->check_ix == 1) { /* check is float */
1307 /* Have both, check_string is floating */
1308 assert(rx_origin + start_shift <= check_at);
1309 if (rx_origin + start_shift != check_at) {
1310 /* not at latest position float substr could match:
1311 * Recheck anchored substring, but not floating.
1312 * The condition above is in bytes rather than
1313 * chars for efficiency. It's conservative, in
1314 * that it errs on the side of doing 'goto
1315 * do_other_substr', where a more accurate
1316 * char-based calculation will be done */
1317 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1318 " Looking for anchored substr starting at offset %ld...\n",
1319 (long)(other_last - strpos)) );
1320 goto do_other_substr;
1328 /* In the presence of ml_anch, we might be able to
1329 * find another \n without breaking the current float
1332 /* strictly speaking this should be HOP3c(..., 1, ...),
1333 * but since we goto a block of code that's going to
1334 * search for the next \n if any, its safe here */
1336 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1337 " Looking for /%s^%s/m starting at offset %ld...\n",
1338 PL_colors[0], PL_colors[1],
1339 (long)(rx_origin - strpos)) );
1340 goto postprocess_substr_matches;
1343 /* strictly speaking this can never be true; but might
1344 * be if we ever allow intuit without substrings */
1345 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1348 rx_origin = rx_max_float;
1351 /* at this point, any matching substrings have been
1352 * contradicted. Start again... */
1354 rx_origin = HOP3c(rx_origin, 1, strend);
1356 /* uses bytes rather than char calculations for efficiency.
1357 * It's conservative: it errs on the side of doing 'goto restart',
1358 * where there is code that does a proper char-based test */
1359 if (rx_origin + start_shift + end_shift > strend) {
1360 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1361 " Could not match STCLASS...\n") );
1364 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1365 " Looking for %s substr starting at offset %ld...\n",
1366 (prog->substrs->check_ix ? "floating" : "anchored"),
1367 (long)(rx_origin + start_shift - strpos)) );
1373 if (rx_origin != s) {
1374 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1375 " By STCLASS: moving %ld --> %ld\n",
1376 (long)(rx_origin - strpos), (long)(s - strpos))
1380 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1381 " Does not contradict STCLASS...\n");
1386 /* Decide whether using the substrings helped */
1388 if (rx_origin != strpos) {
1389 /* Fixed substring is found far enough so that the match
1390 cannot start at strpos. */
1392 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
1393 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1396 /* The found rx_origin position does not prohibit matching at
1397 * strpos, so calling intuit didn't gain us anything. Decrement
1398 * the BmUSEFUL() count on the check substring, and if we reach
1400 if (!(prog->intflags & PREGf_NAUGHTY)
1402 prog->check_utf8 /* Could be deleted already */
1403 && --BmUSEFUL(prog->check_utf8) < 0
1404 && (prog->check_utf8 == prog->float_utf8)
1406 prog->check_substr /* Could be deleted already */
1407 && --BmUSEFUL(prog->check_substr) < 0
1408 && (prog->check_substr == prog->float_substr)
1411 /* If flags & SOMETHING - do not do it many times on the same match */
1412 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
1413 /* XXX Does the destruction order has to change with utf8_target? */
1414 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1415 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1416 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1417 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1418 check = NULL; /* abort */
1419 /* XXXX This is a remnant of the old implementation. It
1420 looks wasteful, since now INTUIT can use many
1421 other heuristics. */
1422 prog->extflags &= ~RXf_USE_INTUIT;
1426 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1427 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1428 PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
1432 fail_finish: /* Substring not found */
1433 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1434 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1436 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1437 PL_colors[4], PL_colors[5]));
1442 #define DECL_TRIE_TYPE(scan) \
1443 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1444 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
1445 trie_type = ((scan->flags == EXACT) \
1446 ? (utf8_target ? trie_utf8 : trie_plain) \
1447 : (scan->flags == EXACTFA) \
1448 ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
1449 : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
1451 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1454 U8 flags = FOLD_FLAGS_FULL; \
1455 switch (trie_type) { \
1456 case trie_utf8_exactfa_fold: \
1457 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1458 /* FALL THROUGH */ \
1459 case trie_utf8_fold: \
1460 if ( foldlen>0 ) { \
1461 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1466 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
1467 len = UTF8SKIP(uc); \
1468 skiplen = UNISKIP( uvc ); \
1469 foldlen -= skiplen; \
1470 uscan = foldbuf + skiplen; \
1473 case trie_latin_utf8_exactfa_fold: \
1474 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1475 /* FALL THROUGH */ \
1476 case trie_latin_utf8_fold: \
1477 if ( foldlen>0 ) { \
1478 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1484 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1485 skiplen = UNISKIP( uvc ); \
1486 foldlen -= skiplen; \
1487 uscan = foldbuf + skiplen; \
1491 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1498 charid = trie->charmap[ uvc ]; \
1502 if (widecharmap) { \
1503 SV** const svpp = hv_fetch(widecharmap, \
1504 (char*)&uvc, sizeof(UV), 0); \
1506 charid = (U16)SvIV(*svpp); \
1511 #define REXEC_FBC_EXACTISH_SCAN(CoNd) \
1515 && (ln == 1 || folder(s, pat_string, ln)) \
1516 && (reginfo->intuit || regtry(reginfo, &s)) )\
1522 #define REXEC_FBC_UTF8_SCAN(CoDe) \
1524 while (s < strend) { \
1530 #define REXEC_FBC_SCAN(CoDe) \
1532 while (s < strend) { \
1538 #define REXEC_FBC_UTF8_CLASS_SCAN(CoNd) \
1539 REXEC_FBC_UTF8_SCAN( \
1541 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1550 #define REXEC_FBC_CLASS_SCAN(CoNd) \
1553 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1562 #define REXEC_FBC_TRYIT \
1563 if ((reginfo->intuit || regtry(reginfo, &s))) \
1566 #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \
1567 if (utf8_target) { \
1568 REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \
1571 REXEC_FBC_CLASS_SCAN(CoNd); \
1574 #define DUMP_EXEC_POS(li,s,doutf8) \
1575 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1579 #define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1580 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1581 tmp = TEST_NON_UTF8(tmp); \
1582 REXEC_FBC_UTF8_SCAN( \
1583 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1592 #define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \
1593 if (s == reginfo->strbeg) { \
1597 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1598 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1599 0, UTF8_ALLOW_DEFAULT); \
1602 LOAD_UTF8_CHARCLASS_ALNUM(); \
1603 REXEC_FBC_UTF8_SCAN( \
1604 if (tmp == ! (TeSt2_UtF8)) { \
1613 /* The only difference between the BOUND and NBOUND cases is that
1614 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1615 * NBOUND. This is accomplished by passing it in either the if or else clause,
1616 * with the other one being empty */
1617 #define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1618 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1620 #define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1621 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1623 #define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1624 FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1626 #define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \
1627 FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1630 /* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to
1631 * be passed in completely with the variable name being tested, which isn't
1632 * such a clean interface, but this is easier to read than it was before. We
1633 * are looking for the boundary (or non-boundary between a word and non-word
1634 * character. The utf8 and non-utf8 cases have the same logic, but the details
1635 * must be different. Find the "wordness" of the character just prior to this
1636 * one, and compare it with the wordness of this one. If they differ, we have
1637 * a boundary. At the beginning of the string, pretend that the previous
1638 * character was a new-line */
1639 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1640 if (utf8_target) { \
1643 else { /* Not utf8 */ \
1644 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1645 tmp = TEST_NON_UTF8(tmp); \
1647 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1656 if ((!prog->minlen && tmp) && (reginfo->intuit || regtry(reginfo, &s))) \
1659 /* We know what class REx starts with. Try to find this position... */
1660 /* if reginfo->intuit, its a dryrun */
1661 /* annoyingly all the vars in this routine have different names from their counterparts
1662 in regmatch. /grrr */
1665 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1666 const char *strend, regmatch_info *reginfo)
1669 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1670 char *pat_string; /* The pattern's exactish string */
1671 char *pat_end; /* ptr to end char of pat_string */
1672 re_fold_t folder; /* Function for computing non-utf8 folds */
1673 const U8 *fold_array; /* array for folding ords < 256 */
1679 I32 tmp = 1; /* Scratch variable? */
1680 const bool utf8_target = reginfo->is_utf8_target;
1681 UV utf8_fold_flags = 0;
1682 const bool is_utf8_pat = reginfo->is_utf8_pat;
1683 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1684 with a result inverts that result, as 0^1 =
1686 _char_class_number classnum;
1688 RXi_GET_DECL(prog,progi);
1690 PERL_ARGS_ASSERT_FIND_BYCLASS;
1692 /* We know what class it must start with. */
1696 REXEC_FBC_UTF8_CLASS_SCAN(
1697 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1700 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1705 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1712 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1713 assert(! is_utf8_pat);
1716 if (is_utf8_pat || utf8_target) {
1717 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1718 goto do_exactf_utf8;
1720 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1721 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1722 goto do_exactf_non_utf8; /* isn't dealt with by these */
1724 case EXACTF: /* This node only generated for non-utf8 patterns */
1725 assert(! is_utf8_pat);
1727 utf8_fold_flags = 0;
1728 goto do_exactf_utf8;
1730 fold_array = PL_fold;
1732 goto do_exactf_non_utf8;
1735 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1736 utf8_fold_flags = FOLDEQ_LOCALE;
1737 goto do_exactf_utf8;
1739 fold_array = PL_fold_locale;
1740 folder = foldEQ_locale;
1741 goto do_exactf_non_utf8;
1745 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1747 goto do_exactf_utf8;
1750 if (is_utf8_pat || utf8_target) {
1751 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1752 goto do_exactf_utf8;
1755 /* Any 'ss' in the pattern should have been replaced by regcomp,
1756 * so we don't have to worry here about this single special case
1757 * in the Latin1 range */
1758 fold_array = PL_fold_latin1;
1759 folder = foldEQ_latin1;
1763 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1764 are no glitches with fold-length differences
1765 between the target string and pattern */
1767 /* The idea in the non-utf8 EXACTF* cases is to first find the
1768 * first character of the EXACTF* node and then, if necessary,
1769 * case-insensitively compare the full text of the node. c1 is the
1770 * first character. c2 is its fold. This logic will not work for
1771 * Unicode semantics and the german sharp ss, which hence should
1772 * not be compiled into a node that gets here. */
1773 pat_string = STRING(c);
1774 ln = STR_LEN(c); /* length to match in octets/bytes */
1776 /* We know that we have to match at least 'ln' bytes (which is the
1777 * same as characters, since not utf8). If we have to match 3
1778 * characters, and there are only 2 availabe, we know without
1779 * trying that it will fail; so don't start a match past the
1780 * required minimum number from the far end */
1781 e = HOP3c(strend, -((SSize_t)ln), s);
1783 if (reginfo->intuit && e < s) {
1784 e = s; /* Due to minlen logic of intuit() */
1788 c2 = fold_array[c1];
1789 if (c1 == c2) { /* If char and fold are the same */
1790 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1793 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1801 /* If one of the operands is in utf8, we can't use the simpler folding
1802 * above, due to the fact that many different characters can have the
1803 * same fold, or portion of a fold, or different- length fold */
1804 pat_string = STRING(c);
1805 ln = STR_LEN(c); /* length to match in octets/bytes */
1806 pat_end = pat_string + ln;
1807 lnc = is_utf8_pat /* length to match in characters */
1808 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1811 /* We have 'lnc' characters to match in the pattern, but because of
1812 * multi-character folding, each character in the target can match
1813 * up to 3 characters (Unicode guarantees it will never exceed
1814 * this) if it is utf8-encoded; and up to 2 if not (based on the
1815 * fact that the Latin 1 folds are already determined, and the
1816 * only multi-char fold in that range is the sharp-s folding to
1817 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1818 * string character. Adjust lnc accordingly, rounding up, so that
1819 * if we need to match at least 4+1/3 chars, that really is 5. */
1820 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1821 lnc = (lnc + expansion - 1) / expansion;
1823 /* As in the non-UTF8 case, if we have to match 3 characters, and
1824 * only 2 are left, it's guaranteed to fail, so don't start a
1825 * match that would require us to go beyond the end of the string
1827 e = HOP3c(strend, -((SSize_t)lnc), s);
1829 if (reginfo->intuit && e < s) {
1830 e = s; /* Due to minlen logic of intuit() */
1833 /* XXX Note that we could recalculate e to stop the loop earlier,
1834 * as the worst case expansion above will rarely be met, and as we
1835 * go along we would usually find that e moves further to the left.
1836 * This would happen only after we reached the point in the loop
1837 * where if there were no expansion we should fail. Unclear if
1838 * worth the expense */
1841 char *my_strend= (char *)strend;
1842 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1843 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1844 && (reginfo->intuit || regtry(reginfo, &s)) )
1848 s += (utf8_target) ? UTF8SKIP(s) : 1;
1853 FBC_BOUND(isWORDCHAR_LC,
1854 isWORDCHAR_LC_uvchr(tmp),
1855 isWORDCHAR_LC_utf8((U8*)s));
1858 FBC_NBOUND(isWORDCHAR_LC,
1859 isWORDCHAR_LC_uvchr(tmp),
1860 isWORDCHAR_LC_utf8((U8*)s));
1863 FBC_BOUND(isWORDCHAR,
1864 isWORDCHAR_uni(tmp),
1865 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1868 FBC_BOUND_NOLOAD(isWORDCHAR_A,
1870 isWORDCHAR_A((U8*)s));
1873 FBC_NBOUND(isWORDCHAR,
1874 isWORDCHAR_uni(tmp),
1875 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1878 FBC_NBOUND_NOLOAD(isWORDCHAR_A,
1880 isWORDCHAR_A((U8*)s));
1883 FBC_BOUND(isWORDCHAR_L1,
1884 isWORDCHAR_uni(tmp),
1885 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1888 FBC_NBOUND(isWORDCHAR_L1,
1889 isWORDCHAR_uni(tmp),
1890 cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
1893 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
1894 is_LNBREAK_latin1_safe(s, strend)
1898 /* The argument to all the POSIX node types is the class number to pass to
1899 * _generic_isCC() to build a mask for searching in PL_charclass[] */
1906 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
1907 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
1922 /* The complement of something that matches only ASCII matches all
1923 * UTF-8 variant code points, plus everything in ASCII that isn't
1925 REXEC_FBC_UTF8_CLASS_SCAN(! UTF8_IS_INVARIANT(*s)
1926 || ! _generic_isCC_A(*s, FLAGS(c)));
1935 /* Don't need to worry about utf8, as it can match only a single
1936 * byte invariant character. */
1937 REXEC_FBC_CLASS_SCAN(
1938 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
1946 if (! utf8_target) {
1947 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
1953 classnum = (_char_class_number) FLAGS(c);
1954 if (classnum < _FIRST_NON_SWASH_CC) {
1955 while (s < strend) {
1957 /* We avoid loading in the swash as long as possible, but
1958 * should we have to, we jump to a separate loop. This
1959 * extra 'if' statement is what keeps this code from being
1960 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
1961 if (UTF8_IS_ABOVE_LATIN1(*s)) {
1962 goto found_above_latin1;
1964 if ((UTF8_IS_INVARIANT(*s)
1965 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
1967 || (UTF8_IS_DOWNGRADEABLE_START(*s)
1968 && to_complement ^ cBOOL(
1969 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
1973 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1985 else switch (classnum) { /* These classes are implemented as
1987 case _CC_ENUM_SPACE: /* XXX would require separate code if we
1988 revert the change of \v matching this */
1991 case _CC_ENUM_PSXSPC:
1992 REXEC_FBC_UTF8_CLASS_SCAN(
1993 to_complement ^ cBOOL(isSPACE_utf8(s)));
1996 case _CC_ENUM_BLANK:
1997 REXEC_FBC_UTF8_CLASS_SCAN(
1998 to_complement ^ cBOOL(isBLANK_utf8(s)));
2001 case _CC_ENUM_XDIGIT:
2002 REXEC_FBC_UTF8_CLASS_SCAN(
2003 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2006 case _CC_ENUM_VERTSPACE:
2007 REXEC_FBC_UTF8_CLASS_SCAN(
2008 to_complement ^ cBOOL(isVERTWS_utf8(s)));
2011 case _CC_ENUM_CNTRL:
2012 REXEC_FBC_UTF8_CLASS_SCAN(
2013 to_complement ^ cBOOL(isCNTRL_utf8(s)));
2017 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2018 assert(0); /* NOTREACHED */
2023 found_above_latin1: /* Here we have to load a swash to get the result
2024 for the current code point */
2025 if (! PL_utf8_swash_ptrs[classnum]) {
2026 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2027 PL_utf8_swash_ptrs[classnum] =
2028 _core_swash_init("utf8",
2031 PL_XPosix_ptrs[classnum], &flags);
2034 /* This is a copy of the loop above for swash classes, though using the
2035 * FBC macro instead of being expanded out. Since we've loaded the
2036 * swash, we don't have to check for that each time through the loop */
2037 REXEC_FBC_UTF8_CLASS_SCAN(
2038 to_complement ^ cBOOL(_generic_utf8(
2041 swash_fetch(PL_utf8_swash_ptrs[classnum],
2049 /* what trie are we using right now */
2050 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2051 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2052 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2054 const char *last_start = strend - trie->minlen;
2056 const char *real_start = s;
2058 STRLEN maxlen = trie->maxlen;
2060 U8 **points; /* map of where we were in the input string
2061 when reading a given char. For ASCII this
2062 is unnecessary overhead as the relationship
2063 is always 1:1, but for Unicode, especially
2064 case folded Unicode this is not true. */
2065 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2069 GET_RE_DEBUG_FLAGS_DECL;
2071 /* We can't just allocate points here. We need to wrap it in
2072 * an SV so it gets freed properly if there is a croak while
2073 * running the match */
2076 sv_points=newSV(maxlen * sizeof(U8 *));
2077 SvCUR_set(sv_points,
2078 maxlen * sizeof(U8 *));
2079 SvPOK_on(sv_points);
2080 sv_2mortal(sv_points);
2081 points=(U8**)SvPV_nolen(sv_points );
2082 if ( trie_type != trie_utf8_fold
2083 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2086 bitmap=(U8*)trie->bitmap;
2088 bitmap=(U8*)ANYOF_BITMAP(c);
2090 /* this is the Aho-Corasick algorithm modified a touch
2091 to include special handling for long "unknown char" sequences.
2092 The basic idea being that we use AC as long as we are dealing
2093 with a possible matching char, when we encounter an unknown char
2094 (and we have not encountered an accepting state) we scan forward
2095 until we find a legal starting char.
2096 AC matching is basically that of trie matching, except that when
2097 we encounter a failing transition, we fall back to the current
2098 states "fail state", and try the current char again, a process
2099 we repeat until we reach the root state, state 1, or a legal
2100 transition. If we fail on the root state then we can either
2101 terminate if we have reached an accepting state previously, or
2102 restart the entire process from the beginning if we have not.
2105 while (s <= last_start) {
2106 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2114 U8 *uscan = (U8*)NULL;
2115 U8 *leftmost = NULL;
2117 U32 accepted_word= 0;
2121 while ( state && uc <= (U8*)strend ) {
2123 U32 word = aho->states[ state ].wordnum;
2127 DEBUG_TRIE_EXECUTE_r(
2128 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2129 dump_exec_pos( (char *)uc, c, strend, real_start,
2130 (char *)uc, utf8_target );
2131 PerlIO_printf( Perl_debug_log,
2132 " Scanning for legal start char...\n");
2136 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2140 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2146 if (uc >(U8*)last_start) break;
2150 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2151 if (!leftmost || lpos < leftmost) {
2152 DEBUG_r(accepted_word=word);
2158 points[pointpos++ % maxlen]= uc;
2159 if (foldlen || uc < (U8*)strend) {
2160 REXEC_TRIE_READ_CHAR(trie_type, trie,
2162 uscan, len, uvc, charid, foldlen,
2164 DEBUG_TRIE_EXECUTE_r({
2165 dump_exec_pos( (char *)uc, c, strend,
2166 real_start, s, utf8_target);
2167 PerlIO_printf(Perl_debug_log,
2168 " Charid:%3u CP:%4"UVxf" ",
2180 word = aho->states[ state ].wordnum;
2182 base = aho->states[ state ].trans.base;
2184 DEBUG_TRIE_EXECUTE_r({
2186 dump_exec_pos( (char *)uc, c, strend, real_start,
2188 PerlIO_printf( Perl_debug_log,
2189 "%sState: %4"UVxf", word=%"UVxf,
2190 failed ? " Fail transition to " : "",
2191 (UV)state, (UV)word);
2197 ( ((offset = base + charid
2198 - 1 - trie->uniquecharcount)) >= 0)
2199 && ((U32)offset < trie->lasttrans)
2200 && trie->trans[offset].check == state
2201 && (tmp=trie->trans[offset].next))
2203 DEBUG_TRIE_EXECUTE_r(
2204 PerlIO_printf( Perl_debug_log," - legal\n"));
2209 DEBUG_TRIE_EXECUTE_r(
2210 PerlIO_printf( Perl_debug_log," - fail\n"));
2212 state = aho->fail[state];
2216 /* we must be accepting here */
2217 DEBUG_TRIE_EXECUTE_r(
2218 PerlIO_printf( Perl_debug_log," - accepting\n"));
2227 if (!state) state = 1;
2230 if ( aho->states[ state ].wordnum ) {
2231 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2232 if (!leftmost || lpos < leftmost) {
2233 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2238 s = (char*)leftmost;
2239 DEBUG_TRIE_EXECUTE_r({
2241 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2242 (UV)accepted_word, (IV)(s - real_start)
2245 if (reginfo->intuit || regtry(reginfo, &s)) {
2251 DEBUG_TRIE_EXECUTE_r({
2252 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2255 DEBUG_TRIE_EXECUTE_r(
2256 PerlIO_printf( Perl_debug_log,"No match.\n"));
2265 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2273 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2274 * flags have same meanings as with regexec_flags() */
2277 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2284 struct regexp *const prog = ReANY(rx);
2286 if (flags & REXEC_COPY_STR) {
2290 PerlIO_printf(Perl_debug_log,
2291 "Copy on write: regexp capture, type %d\n",
2294 /* Create a new COW SV to share the match string and store
2295 * in saved_copy, unless the current COW SV in saved_copy
2296 * is valid and suitable for our purpose */
2297 if (( prog->saved_copy
2298 && SvIsCOW(prog->saved_copy)
2299 && SvPOKp(prog->saved_copy)
2302 && SvPVX(sv) == SvPVX(prog->saved_copy)))
2304 /* just reuse saved_copy SV */
2305 if (RXp_MATCH_COPIED(prog)) {
2306 Safefree(prog->subbeg);
2307 RXp_MATCH_COPIED_off(prog);
2311 /* create new COW SV to share string */
2312 RX_MATCH_COPY_FREE(rx);
2313 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2315 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2316 assert (SvPOKp(prog->saved_copy));
2317 prog->sublen = strend - strbeg;
2318 prog->suboffset = 0;
2319 prog->subcoffset = 0;
2324 SSize_t max = strend - strbeg;
2327 if ( (flags & REXEC_COPY_SKIP_POST)
2328 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2329 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2330 ) { /* don't copy $' part of string */
2333 /* calculate the right-most part of the string covered
2334 * by a capture. Due to look-ahead, this may be to
2335 * the right of $&, so we have to scan all captures */
2336 while (n <= prog->lastparen) {
2337 if (prog->offs[n].end > max)
2338 max = prog->offs[n].end;
2342 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2343 ? prog->offs[0].start
2345 assert(max >= 0 && max <= strend - strbeg);
2348 if ( (flags & REXEC_COPY_SKIP_PRE)
2349 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2350 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2351 ) { /* don't copy $` part of string */
2354 /* calculate the left-most part of the string covered
2355 * by a capture. Due to look-behind, this may be to
2356 * the left of $&, so we have to scan all captures */
2357 while (min && n <= prog->lastparen) {
2358 if ( prog->offs[n].start != -1
2359 && prog->offs[n].start < min)
2361 min = prog->offs[n].start;
2365 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2366 && min > prog->offs[0].end
2368 min = prog->offs[0].end;
2372 assert(min >= 0 && min <= max && min <= strend - strbeg);
2375 if (RX_MATCH_COPIED(rx)) {
2376 if (sublen > prog->sublen)
2378 (char*)saferealloc(prog->subbeg, sublen+1);
2381 prog->subbeg = (char*)safemalloc(sublen+1);
2382 Copy(strbeg + min, prog->subbeg, sublen, char);
2383 prog->subbeg[sublen] = '\0';
2384 prog->suboffset = min;
2385 prog->sublen = sublen;
2386 RX_MATCH_COPIED_on(rx);
2388 prog->subcoffset = prog->suboffset;
2389 if (prog->suboffset && utf8_target) {
2390 /* Convert byte offset to chars.
2391 * XXX ideally should only compute this if @-/@+
2392 * has been seen, a la PL_sawampersand ??? */
2394 /* If there's a direct correspondence between the
2395 * string which we're matching and the original SV,
2396 * then we can use the utf8 len cache associated with
2397 * the SV. In particular, it means that under //g,
2398 * sv_pos_b2u() will use the previously cached
2399 * position to speed up working out the new length of
2400 * subcoffset, rather than counting from the start of
2401 * the string each time. This stops
2402 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2403 * from going quadratic */
2404 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2405 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2406 SV_GMAGIC|SV_CONST_RETURN);
2408 prog->subcoffset = utf8_length((U8*)strbeg,
2409 (U8*)(strbeg+prog->suboffset));
2413 RX_MATCH_COPY_FREE(rx);
2414 prog->subbeg = strbeg;
2415 prog->suboffset = 0;
2416 prog->subcoffset = 0;
2417 prog->sublen = strend - strbeg;
2425 - regexec_flags - match a regexp against a string
2428 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2429 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2430 /* stringarg: the point in the string at which to begin matching */
2431 /* strend: pointer to null at end of string */
2432 /* strbeg: real beginning of string */
2433 /* minend: end of match must be >= minend bytes after stringarg. */
2434 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2435 * itself is accessed via the pointers above */
2436 /* data: May be used for some additional optimizations.
2437 Currently unused. */
2438 /* flags: For optimizations. See REXEC_* in regexp.h */
2442 struct regexp *const prog = ReANY(rx);
2446 SSize_t minlen; /* must match at least this many chars */
2447 SSize_t dontbother = 0; /* how many characters not to try at end */
2448 const bool utf8_target = cBOOL(DO_UTF8(sv));
2450 RXi_GET_DECL(prog,progi);
2451 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2452 regmatch_info *const reginfo = ®info_buf;
2453 regexp_paren_pair *swap = NULL;
2455 GET_RE_DEBUG_FLAGS_DECL;
2457 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2458 PERL_UNUSED_ARG(data);
2460 /* Be paranoid... */
2461 if (prog == NULL || stringarg == NULL) {
2462 Perl_croak(aTHX_ "NULL regexp parameter");
2467 debug_start_match(rx, utf8_target, stringarg, strend,
2471 startpos = stringarg;
2473 if (prog->intflags & PREGf_GPOS_SEEN) {
2476 /* set reginfo->ganch, the position where \G can match */
2479 (flags & REXEC_IGNOREPOS)
2480 ? stringarg /* use start pos rather than pos() */
2481 : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2482 /* Defined pos(): */
2483 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2484 : strbeg; /* pos() not defined; use start of string */
2486 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2487 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2489 /* in the presence of \G, we may need to start looking earlier in
2490 * the string than the suggested start point of stringarg:
2491 * if prog->gofs is set, then that's a known, fixed minimum
2494 * /ab|c\G/: gofs = 1
2495 * or if the minimum offset isn't known, then we have to go back
2496 * to the start of the string, e.g. /w+\G/
2499 if (prog->intflags & PREGf_ANCH_GPOS) {
2500 startpos = reginfo->ganch - prog->gofs;
2502 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2504 DEBUG_r(PerlIO_printf(Perl_debug_log,
2505 "fail: ganch-gofs before earliest possible start\n"));
2509 else if (prog->gofs) {
2510 if (startpos - prog->gofs < strbeg)
2513 startpos -= prog->gofs;
2515 else if (prog->intflags & PREGf_GPOS_FLOAT)
2519 minlen = prog->minlen;
2520 if ((startpos + minlen) > strend || startpos < strbeg) {
2521 DEBUG_r(PerlIO_printf(Perl_debug_log,
2522 "Regex match can't succeed, so not even tried\n"));
2526 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2527 * which will call destuctors to reset PL_regmatch_state, free higher
2528 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2529 * regmatch_info_aux_eval */
2531 oldsave = PL_savestack_ix;
2535 if ((prog->extflags & RXf_USE_INTUIT)
2536 && !(flags & REXEC_CHECKED))
2538 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2543 if (prog->extflags & RXf_CHECK_ALL) {
2544 /* we can match based purely on the result of INTUIT.
2545 * Set up captures etc just for $& and $-[0]
2546 * (an intuit-only match wont have $1,$2,..) */
2547 assert(!prog->nparens);
2549 /* s/// doesn't like it if $& is earlier than where we asked it to
2550 * start searching (which can happen on something like /.\G/) */
2551 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2554 /* this should only be possible under \G */
2555 assert(prog->intflags & PREGf_GPOS_SEEN);
2556 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2557 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2561 /* match via INTUIT shouldn't have any captures.
2562 * Let @-, @+, $^N know */
2563 prog->lastparen = prog->lastcloseparen = 0;
2564 RX_MATCH_UTF8_set(rx, utf8_target);
2565 prog->offs[0].start = s - strbeg;
2566 prog->offs[0].end = utf8_target
2567 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2568 : s - strbeg + prog->minlenret;
2569 if ( !(flags & REXEC_NOT_FIRST) )
2570 S_reg_set_capture_string(aTHX_ rx,
2572 sv, flags, utf8_target);
2578 multiline = prog->extflags & RXf_PMf_MULTILINE;
2580 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2581 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2582 "String too short [regexec_flags]...\n"));
2586 /* Check validity of program. */
2587 if (UCHARAT(progi->program) != REG_MAGIC) {
2588 Perl_croak(aTHX_ "corrupted regexp program");
2591 RX_MATCH_TAINTED_off(rx);
2593 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2594 reginfo->intuit = 0;
2595 reginfo->is_utf8_target = cBOOL(utf8_target);
2596 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2597 reginfo->warned = FALSE;
2598 reginfo->strbeg = strbeg;
2600 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2601 reginfo->strend = strend;
2602 /* see how far we have to get to not match where we matched before */
2603 reginfo->till = stringarg + minend;
2605 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
2606 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2607 S_cleanup_regmatch_info_aux has executed (registered by
2608 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2609 magic belonging to this SV.
2610 Not newSVsv, either, as it does not COW.
2612 assert(!IS_PADGV(sv));
2613 reginfo->sv = newSV(0);
2614 SvSetSV_nosteal(reginfo->sv, sv);
2615 SAVEFREESV(reginfo->sv);
2618 /* reserve next 2 or 3 slots in PL_regmatch_state:
2619 * slot N+0: may currently be in use: skip it
2620 * slot N+1: use for regmatch_info_aux struct
2621 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2622 * slot N+3: ready for use by regmatch()
2626 regmatch_state *old_regmatch_state;
2627 regmatch_slab *old_regmatch_slab;
2628 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2630 /* on first ever match, allocate first slab */
2631 if (!PL_regmatch_slab) {
2632 Newx(PL_regmatch_slab, 1, regmatch_slab);
2633 PL_regmatch_slab->prev = NULL;
2634 PL_regmatch_slab->next = NULL;
2635 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
2638 old_regmatch_state = PL_regmatch_state;
2639 old_regmatch_slab = PL_regmatch_slab;
2641 for (i=0; i <= max; i++) {
2643 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
2645 reginfo->info_aux_eval =
2646 reginfo->info_aux->info_aux_eval =
2647 &(PL_regmatch_state->u.info_aux_eval);
2649 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
2650 PL_regmatch_state = S_push_slab(aTHX);
2653 /* note initial PL_regmatch_state position; at end of match we'll
2654 * pop back to there and free any higher slabs */
2656 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
2657 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
2658 reginfo->info_aux->poscache = NULL;
2660 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
2662 if ((prog->extflags & RXf_EVAL_SEEN))
2663 S_setup_eval_state(aTHX_ reginfo);
2665 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
2668 /* If there is a "must appear" string, look for it. */
2670 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
2671 /* We have to be careful. If the previous successful match
2672 was from this regex we don't want a subsequent partially
2673 successful match to clobber the old results.
2674 So when we detect this possibility we add a swap buffer
2675 to the re, and switch the buffer each match. If we fail,
2676 we switch it back; otherwise we leave it swapped.
2679 /* do we need a save destructor here for eval dies? */
2680 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
2681 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
2682 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
2689 /* Simplest case: anchored match need be tried only once. */
2690 /* [unless only anchor is BOL and multiline is set] */
2691 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
2692 if (s == startpos && regtry(reginfo, &s))
2694 else if (multiline || (prog->intflags & (PREGf_IMPLICIT | PREGf_ANCH_MBOL))) /* XXXX SBOL? */
2699 dontbother = minlen - 1;
2700 end = HOP3c(strend, -dontbother, strbeg) - 1;
2701 /* for multiline we only have to try after newlines */
2702 if (prog->check_substr || prog->check_utf8) {
2703 /* because of the goto we can not easily reuse the macros for bifurcating the
2704 unicode/non-unicode match modes here like we do elsewhere - demerphq */
2707 goto after_try_utf8;
2709 if (regtry(reginfo, &s)) {
2716 if (prog->extflags & RXf_USE_INTUIT) {
2717 s = re_intuit_start(rx, sv, strbeg,
2718 s + UTF8SKIP(s), strend, flags, NULL);
2727 } /* end search for check string in unicode */
2729 if (s == startpos) {
2730 goto after_try_latin;
2733 if (regtry(reginfo, &s)) {
2740 if (prog->extflags & RXf_USE_INTUIT) {
2741 s = re_intuit_start(rx, sv, strbeg,
2742 s + 1, strend, flags, NULL);
2751 } /* end search for check string in latin*/
2752 } /* end search for check string */
2753 else { /* search for newline */
2755 /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/
2758 /* We can use a more efficient search as newlines are the same in unicode as they are in latin */
2759 while (s <= end) { /* note it could be possible to match at the end of the string */
2760 if (*s++ == '\n') { /* don't need PL_utf8skip here */
2761 if (regtry(reginfo, &s))
2765 } /* end search for newline */
2766 } /* end anchored/multiline check string search */
2768 } else if (prog->intflags & PREGf_ANCH_GPOS)
2770 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
2771 assert(prog->intflags & PREGf_GPOS_SEEN);
2772 /* For anchored \G, the only position it can match from is
2773 * (ganch-gofs); we already set startpos to this above; if intuit
2774 * moved us on from there, we can't possibly succeed */
2775 assert(startpos == reginfo->ganch - prog->gofs);
2776 if (s == startpos && regtry(reginfo, &s))
2781 /* Messy cases: unanchored match. */
2782 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
2783 /* we have /x+whatever/ */
2784 /* it must be a one character string (XXXX Except is_utf8_pat?) */
2790 if (! prog->anchored_utf8) {
2791 to_utf8_substr(prog);
2793 ch = SvPVX_const(prog->anchored_utf8)[0];
2796 DEBUG_EXECUTE_r( did_match = 1 );
2797 if (regtry(reginfo, &s)) goto got_it;
2799 while (s < strend && *s == ch)
2806 if (! prog->anchored_substr) {
2807 if (! to_byte_substr(prog)) {
2808 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2811 ch = SvPVX_const(prog->anchored_substr)[0];
2814 DEBUG_EXECUTE_r( did_match = 1 );
2815 if (regtry(reginfo, &s)) goto got_it;
2817 while (s < strend && *s == ch)
2822 DEBUG_EXECUTE_r(if (!did_match)
2823 PerlIO_printf(Perl_debug_log,
2824 "Did not find anchored character...\n")
2827 else if (prog->anchored_substr != NULL
2828 || prog->anchored_utf8 != NULL
2829 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
2830 && prog->float_max_offset < strend - s)) {
2835 char *last1; /* Last position checked before */
2839 if (prog->anchored_substr || prog->anchored_utf8) {
2841 if (! prog->anchored_utf8) {
2842 to_utf8_substr(prog);
2844 must = prog->anchored_utf8;
2847 if (! prog->anchored_substr) {
2848 if (! to_byte_substr(prog)) {
2849 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2852 must = prog->anchored_substr;
2854 back_max = back_min = prog->anchored_offset;
2857 if (! prog->float_utf8) {
2858 to_utf8_substr(prog);
2860 must = prog->float_utf8;
2863 if (! prog->float_substr) {
2864 if (! to_byte_substr(prog)) {
2865 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2868 must = prog->float_substr;
2870 back_max = prog->float_max_offset;
2871 back_min = prog->float_min_offset;
2877 last = HOP3c(strend, /* Cannot start after this */
2878 -(SSize_t)(CHR_SVLEN(must)
2879 - (SvTAIL(must) != 0) + back_min), strbeg);
2881 if (s > reginfo->strbeg)
2882 last1 = HOPc(s, -1);
2884 last1 = s - 1; /* bogus */
2886 /* XXXX check_substr already used to find "s", can optimize if
2887 check_substr==must. */
2889 strend = HOPc(strend, -dontbother);
2890 while ( (s <= last) &&
2891 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
2892 (unsigned char*)strend, must,
2893 multiline ? FBMrf_MULTILINE : 0)) ) {
2894 DEBUG_EXECUTE_r( did_match = 1 );
2895 if (HOPc(s, -back_max) > last1) {
2896 last1 = HOPc(s, -back_min);
2897 s = HOPc(s, -back_max);
2900 char * const t = (last1 >= reginfo->strbeg)
2901 ? HOPc(last1, 1) : last1 + 1;
2903 last1 = HOPc(s, -back_min);
2907 while (s <= last1) {
2908 if (regtry(reginfo, &s))
2911 s++; /* to break out of outer loop */
2918 while (s <= last1) {
2919 if (regtry(reginfo, &s))
2925 DEBUG_EXECUTE_r(if (!did_match) {
2926 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
2927 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
2928 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
2929 ((must == prog->anchored_substr || must == prog->anchored_utf8)
2930 ? "anchored" : "floating"),
2931 quoted, RE_SV_TAIL(must));
2935 else if ( (c = progi->regstclass) ) {
2937 const OPCODE op = OP(progi->regstclass);
2938 /* don't bother with what can't match */
2939 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
2940 strend = HOPc(strend, -(minlen - 1));
2943 SV * const prop = sv_newmortal();
2944 regprop(prog, prop, c, reginfo);
2946 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
2948 PerlIO_printf(Perl_debug_log,
2949 "Matching stclass %.*s against %s (%d bytes)\n",
2950 (int)SvCUR(prop), SvPVX_const(prop),
2951 quoted, (int)(strend - s));
2954 if (find_byclass(prog, c, s, strend, reginfo))
2956 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
2960 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
2968 if (! prog->float_utf8) {
2969 to_utf8_substr(prog);
2971 float_real = prog->float_utf8;
2974 if (! prog->float_substr) {
2975 if (! to_byte_substr(prog)) {
2976 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
2979 float_real = prog->float_substr;
2982 little = SvPV_const(float_real, len);
2983 if (SvTAIL(float_real)) {
2984 /* This means that float_real contains an artificial \n on
2985 * the end due to the presence of something like this:
2986 * /foo$/ where we can match both "foo" and "foo\n" at the
2987 * end of the string. So we have to compare the end of the
2988 * string first against the float_real without the \n and
2989 * then against the full float_real with the string. We
2990 * have to watch out for cases where the string might be
2991 * smaller than the float_real or the float_real without
2993 char *checkpos= strend - len;
2995 PerlIO_printf(Perl_debug_log,
2996 "%sChecking for float_real.%s\n",
2997 PL_colors[4], PL_colors[5]));
2998 if (checkpos + 1 < strbeg) {
2999 /* can't match, even if we remove the trailing \n
3000 * string is too short to match */
3002 PerlIO_printf(Perl_debug_log,
3003 "%sString shorter than required trailing substring, cannot match.%s\n",
3004 PL_colors[4], PL_colors[5]));
3006 } else if (memEQ(checkpos + 1, little, len - 1)) {
3007 /* can match, the end of the string matches without the
3009 last = checkpos + 1;
3010 } else if (checkpos < strbeg) {
3011 /* cant match, string is too short when the "\n" is
3014 PerlIO_printf(Perl_debug_log,
3015 "%sString does not contain required trailing substring, cannot match.%s\n",
3016 PL_colors[4], PL_colors[5]));
3018 } else if (!multiline) {
3019 /* non multiline match, so compare with the "\n" at the
3020 * end of the string */
3021 if (memEQ(checkpos, little, len)) {
3025 PerlIO_printf(Perl_debug_log,
3026 "%sString does not contain required trailing substring, cannot match.%s\n",
3027 PL_colors[4], PL_colors[5]));
3031 /* multiline match, so we have to search for a place
3032 * where the full string is located */
3038 last = rninstr(s, strend, little, little + len);
3040 last = strend; /* matching "$" */
3043 /* at one point this block contained a comment which was
3044 * probably incorrect, which said that this was a "should not
3045 * happen" case. Even if it was true when it was written I am
3046 * pretty sure it is not anymore, so I have removed the comment
3047 * and replaced it with this one. Yves */
3049 PerlIO_printf(Perl_debug_log,
3050 "String does not contain required substring, cannot match.\n"
3054 dontbother = strend - last + prog->float_min_offset;
3056 if (minlen && (dontbother < minlen))
3057 dontbother = minlen - 1;
3058 strend -= dontbother; /* this one's always in bytes! */
3059 /* We don't know much -- general case. */
3062 if (regtry(reginfo, &s))
3071 if (regtry(reginfo, &s))
3073 } while (s++ < strend);
3081 /* s/// doesn't like it if $& is earlier than where we asked it to
3082 * start searching (which can happen on something like /.\G/) */
3083 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3084 && (prog->offs[0].start < stringarg - strbeg))
3086 /* this should only be possible under \G */
3087 assert(prog->intflags & PREGf_GPOS_SEEN);
3088 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3089 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3095 PerlIO_printf(Perl_debug_log,
3096 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3103 /* clean up; this will trigger destructors that will free all slabs
3104 * above the current one, and cleanup the regmatch_info_aux
3105 * and regmatch_info_aux_eval sructs */
3107 LEAVE_SCOPE(oldsave);
3109 if (RXp_PAREN_NAMES(prog))
3110 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3112 RX_MATCH_UTF8_set(rx, utf8_target);
3114 /* make sure $`, $&, $', and $digit will work later */
3115 if ( !(flags & REXEC_NOT_FIRST) )
3116 S_reg_set_capture_string(aTHX_ rx,
3117 strbeg, reginfo->strend,
3118 sv, flags, utf8_target);
3123 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
3124 PL_colors[4], PL_colors[5]));
3126 /* clean up; this will trigger destructors that will free all slabs
3127 * above the current one, and cleanup the regmatch_info_aux
3128 * and regmatch_info_aux_eval sructs */
3130 LEAVE_SCOPE(oldsave);
3133 /* we failed :-( roll it back */
3134 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3135 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3140 Safefree(prog->offs);
3147 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3148 * Do inc before dec, in case old and new rex are the same */
3149 #define SET_reg_curpm(Re2) \
3150 if (reginfo->info_aux_eval) { \
3151 (void)ReREFCNT_inc(Re2); \
3152 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3153 PM_SETRE((PL_reg_curpm), (Re2)); \
3158 - regtry - try match at specific point
3160 STATIC I32 /* 0 failure, 1 success */
3161 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3165 REGEXP *const rx = reginfo->prog;
3166 regexp *const prog = ReANY(rx);
3168 RXi_GET_DECL(prog,progi);
3169 GET_RE_DEBUG_FLAGS_DECL;
3171 PERL_ARGS_ASSERT_REGTRY;
3173 reginfo->cutpoint=NULL;
3175 prog->offs[0].start = *startposp - reginfo->strbeg;
3176 prog->lastparen = 0;
3177 prog->lastcloseparen = 0;
3179 /* XXXX What this code is doing here?!!! There should be no need
3180 to do this again and again, prog->lastparen should take care of
3183 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3184 * Actually, the code in regcppop() (which Ilya may be meaning by
3185 * prog->lastparen), is not needed at all by the test suite
3186 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3187 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3188 * Meanwhile, this code *is* needed for the
3189 * above-mentioned test suite tests to succeed. The common theme
3190 * on those tests seems to be returning null fields from matches.
3191 * --jhi updated by dapm */
3193 if (prog->nparens) {
3194 regexp_paren_pair *pp = prog->offs;
3196 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3204 result = regmatch(reginfo, *startposp, progi->program + 1);
3206 prog->offs[0].end = result;
3209 if (reginfo->cutpoint)
3210 *startposp= reginfo->cutpoint;
3211 REGCP_UNWIND(lastcp);
3216 #define sayYES goto yes
3217 #define sayNO goto no
3218 #define sayNO_SILENT goto no_silent
3220 /* we dont use STMT_START/END here because it leads to
3221 "unreachable code" warnings, which are bogus, but distracting. */
3222 #define CACHEsayNO \
3223 if (ST.cache_mask) \
3224 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3227 /* this is used to determine how far from the left messages like
3228 'failed...' are printed. It should be set such that messages
3229 are inline with the regop output that created them.
3231 #define REPORT_CODE_OFF 32
3234 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3235 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
3236 #define CHRTEST_NOT_A_CP_1 -999
3237 #define CHRTEST_NOT_A_CP_2 -998
3239 /* grab a new slab and return the first slot in it */
3241 STATIC regmatch_state *
3244 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3247 regmatch_slab *s = PL_regmatch_slab->next;
3249 Newx(s, 1, regmatch_slab);
3250 s->prev = PL_regmatch_slab;
3252 PL_regmatch_slab->next = s;
3254 PL_regmatch_slab = s;
3255 return SLAB_FIRST(s);
3259 /* push a new state then goto it */
3261 #define PUSH_STATE_GOTO(state, node, input) \
3262 pushinput = input; \
3264 st->resume_state = state; \
3267 /* push a new state with success backtracking, then goto it */
3269 #define PUSH_YES_STATE_GOTO(state, node, input) \
3270 pushinput = input; \
3272 st->resume_state = state; \
3273 goto push_yes_state;
3280 regmatch() - main matching routine
3282 This is basically one big switch statement in a loop. We execute an op,
3283 set 'next' to point the next op, and continue. If we come to a point which
3284 we may need to backtrack to on failure such as (A|B|C), we push a
3285 backtrack state onto the backtrack stack. On failure, we pop the top
3286 state, and re-enter the loop at the state indicated. If there are no more
3287 states to pop, we return failure.
3289 Sometimes we also need to backtrack on success; for example /A+/, where
3290 after successfully matching one A, we need to go back and try to
3291 match another one; similarly for lookahead assertions: if the assertion
3292 completes successfully, we backtrack to the state just before the assertion
3293 and then carry on. In these cases, the pushed state is marked as
3294 'backtrack on success too'. This marking is in fact done by a chain of
3295 pointers, each pointing to the previous 'yes' state. On success, we pop to
3296 the nearest yes state, discarding any intermediate failure-only states.
3297 Sometimes a yes state is pushed just to force some cleanup code to be
3298 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3299 it to free the inner regex.
3301 Note that failure backtracking rewinds the cursor position, while
3302 success backtracking leaves it alone.
3304 A pattern is complete when the END op is executed, while a subpattern
3305 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3306 ops trigger the "pop to last yes state if any, otherwise return true"
3309 A common convention in this function is to use A and B to refer to the two
3310 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3311 the subpattern to be matched possibly multiple times, while B is the entire
3312 rest of the pattern. Variable and state names reflect this convention.
3314 The states in the main switch are the union of ops and failure/success of
3315 substates associated with with that op. For example, IFMATCH is the op
3316 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3317 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3318 successfully matched A and IFMATCH_A_fail is a state saying that we have
3319 just failed to match A. Resume states always come in pairs. The backtrack
3320 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3321 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3322 on success or failure.
3324 The struct that holds a backtracking state is actually a big union, with
3325 one variant for each major type of op. The variable st points to the
3326 top-most backtrack struct. To make the code clearer, within each
3327 block of code we #define ST to alias the relevant union.
3329 Here's a concrete example of a (vastly oversimplified) IFMATCH
3335 #define ST st->u.ifmatch
3337 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3338 ST.foo = ...; // some state we wish to save
3340 // push a yes backtrack state with a resume value of
3341 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3343 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3346 case IFMATCH_A: // we have successfully executed A; now continue with B
3348 bar = ST.foo; // do something with the preserved value
3351 case IFMATCH_A_fail: // A failed, so the assertion failed
3352 ...; // do some housekeeping, then ...
3353 sayNO; // propagate the failure
3360 For any old-timers reading this who are familiar with the old recursive
3361 approach, the code above is equivalent to:
3363 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3372 ...; // do some housekeeping, then ...
3373 sayNO; // propagate the failure
3376 The topmost backtrack state, pointed to by st, is usually free. If you
3377 want to claim it, populate any ST.foo fields in it with values you wish to
3378 save, then do one of
3380 PUSH_STATE_GOTO(resume_state, node, newinput);
3381 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3383 which sets that backtrack state's resume value to 'resume_state', pushes a
3384 new free entry to the top of the backtrack stack, then goes to 'node'.
3385 On backtracking, the free slot is popped, and the saved state becomes the
3386 new free state. An ST.foo field in this new top state can be temporarily
3387 accessed to retrieve values, but once the main loop is re-entered, it
3388 becomes available for reuse.
3390 Note that the depth of the backtrack stack constantly increases during the
3391 left-to-right execution of the pattern, rather than going up and down with
3392 the pattern nesting. For example the stack is at its maximum at Z at the
3393 end of the pattern, rather than at X in the following:
3395 /(((X)+)+)+....(Y)+....Z/
3397 The only exceptions to this are lookahead/behind assertions and the cut,
3398 (?>A), which pop all the backtrack states associated with A before
3401 Backtrack state structs are allocated in slabs of about 4K in size.
3402 PL_regmatch_state and st always point to the currently active state,
3403 and PL_regmatch_slab points to the slab currently containing
3404 PL_regmatch_state. The first time regmatch() is called, the first slab is
3405 allocated, and is never freed until interpreter destruction. When the slab
3406 is full, a new one is allocated and chained to the end. At exit from
3407 regmatch(), slabs allocated since entry are freed.
3412 #define DEBUG_STATE_pp(pp) \
3414 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3415 PerlIO_printf(Perl_debug_log, \
3416 " %*s"pp" %s%s%s%s%s\n", \
3418 PL_reg_name[st->resume_state], \
3419 ((st==yes_state||st==mark_state) ? "[" : ""), \
3420 ((st==yes_state) ? "Y" : ""), \
3421 ((st==mark_state) ? "M" : ""), \
3422 ((st==yes_state||st==mark_state) ? "]" : "") \
3427 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3432 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3433 const char *start, const char *end, const char *blurb)
3435 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3437 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3442 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3443 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3445 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3446 start, end - start, 60);
3448 PerlIO_printf(Perl_debug_log,
3449 "%s%s REx%s %s against %s\n",
3450 PL_colors[4], blurb, PL_colors[5], s0, s1);
3452 if (utf8_target||utf8_pat)
3453 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3454 utf8_pat ? "pattern" : "",
3455 utf8_pat && utf8_target ? " and " : "",
3456 utf8_target ? "string" : ""
3462 S_dump_exec_pos(pTHX_ const char *locinput,
3463 const regnode *scan,
3464 const char *loc_regeol,
3465 const char *loc_bostr,
3466 const char *loc_reg_starttry,
3467 const bool utf8_target)
3469 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3470 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3471 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3472 /* The part of the string before starttry has one color
3473 (pref0_len chars), between starttry and current
3474 position another one (pref_len - pref0_len chars),
3475 after the current position the third one.
3476 We assume that pref0_len <= pref_len, otherwise we
3477 decrease pref0_len. */
3478 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3479 ? (5 + taill) - l : locinput - loc_bostr;
3482 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3484 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3486 pref0_len = pref_len - (locinput - loc_reg_starttry);
3487 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3488 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3489 ? (5 + taill) - pref_len : loc_regeol - locinput);
3490 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3494 if (pref0_len > pref_len)
3495 pref0_len = pref_len;
3497 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3499 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3500 (locinput - pref_len),pref0_len, 60, 4, 5);
3502 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3503 (locinput - pref_len + pref0_len),
3504 pref_len - pref0_len, 60, 2, 3);
3506 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3507 locinput, loc_regeol - locinput, 10, 0, 1);
3509 const STRLEN tlen=len0+len1+len2;
3510 PerlIO_printf(Perl_debug_log,
3511 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3512 (IV)(locinput - loc_bostr),
3515 (docolor ? "" : "> <"),
3517 (int)(tlen > 19 ? 0 : 19 - tlen),
3524 /* reg_check_named_buff_matched()
3525 * Checks to see if a named buffer has matched. The data array of
3526 * buffer numbers corresponding to the buffer is expected to reside
3527 * in the regexp->data->data array in the slot stored in the ARG() of
3528 * node involved. Note that this routine doesn't actually care about the
3529 * name, that information is not preserved from compilation to execution.
3530 * Returns the index of the leftmost defined buffer with the given name
3531 * or 0 if non of the buffers matched.
3534 S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan)
3537 RXi_GET_DECL(rex,rexi);
3538 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3539 I32 *nums=(I32*)SvPVX(sv_dat);
3541 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3543 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3544 if ((I32)rex->lastparen >= nums[n] &&
3545 rex->offs[nums[n]].end != -1)
3555 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3556 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3558 /* This function determines if there are one or two characters that match
3559 * the first character of the passed-in EXACTish node <text_node>, and if
3560 * so, returns them in the passed-in pointers.
3562 * If it determines that no possible character in the target string can
3563 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3564 * the first character in <text_node> requires UTF-8 to represent, and the
3565 * target string isn't in UTF-8.)
3567 * If there are more than two characters that could match the beginning of
3568 * <text_node>, or if more context is required to determine a match or not,
3569 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3571 * The motiviation behind this function is to allow the caller to set up
3572 * tight loops for matching. If <text_node> is of type EXACT, there is
3573 * only one possible character that can match its first character, and so
3574 * the situation is quite simple. But things get much more complicated if
3575 * folding is involved. It may be that the first character of an EXACTFish
3576 * node doesn't participate in any possible fold, e.g., punctuation, so it
3577 * can be matched only by itself. The vast majority of characters that are
3578 * in folds match just two things, their lower and upper-case equivalents.
3579 * But not all are like that; some have multiple possible matches, or match
3580 * sequences of more than one character. This function sorts all that out.
3582 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3583 * loop of trying to match A*, we know we can't exit where the thing
3584 * following it isn't a B. And something can't be a B unless it is the
3585 * beginning of B. By putting a quick test for that beginning in a tight
3586 * loop, we can rule out things that can't possibly be B without having to
3587 * break out of the loop, thus avoiding work. Similarly, if A is a single
3588 * character, we can make a tight loop matching A*, using the outputs of
3591 * If the target string to match isn't in UTF-8, and there aren't
3592 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3593 * the one or two possible octets (which are characters in this situation)
3594 * that can match. In all cases, if there is only one character that can
3595 * match, *<c1p> and *<c2p> will be identical.
3597 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3598 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3599 * can match the beginning of <text_node>. They should be declared with at
3600 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3601 * undefined what these contain.) If one or both of the buffers are
3602 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3603 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3604 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3605 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3606 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3608 const bool utf8_target = reginfo->is_utf8_target;
3610 UV c1 = CHRTEST_NOT_A_CP_1;
3611 UV c2 = CHRTEST_NOT_A_CP_2;
3612 bool use_chrtest_void = FALSE;
3613 const bool is_utf8_pat = reginfo->is_utf8_pat;
3615 /* Used when we have both utf8 input and utf8 output, to avoid converting
3616 * to/from code points */
3617 bool utf8_has_been_setup = FALSE;
3621 U8 *pat = (U8*)STRING(text_node);
3622 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
3624 if (OP(text_node) == EXACT) {
3626 /* In an exact node, only one thing can be matched, that first
3627 * character. If both the pat and the target are UTF-8, we can just
3628 * copy the input to the output, avoiding finding the code point of
3633 else if (utf8_target) {
3634 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3635 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3636 utf8_has_been_setup = TRUE;
3639 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3642 else { /* an EXACTFish node */
3643 U8 *pat_end = pat + STR_LEN(text_node);
3645 /* An EXACTFL node has at least some characters unfolded, because what
3646 * they match is not known until now. So, now is the time to fold
3647 * the first few of them, as many as are needed to determine 'c1' and
3648 * 'c2' later in the routine. If the pattern isn't UTF-8, we only need
3649 * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3650 * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we
3651 * need to fold as many characters as a single character can fold to,
3652 * so that later we can check if the first ones are such a multi-char
3653 * fold. But, in such a pattern only locale-problematic characters
3654 * aren't folded, so we can skip this completely if the first character
3655 * in the node isn't one of the tricky ones */
3656 if (OP(text_node) == EXACTFL) {
3658 if (! is_utf8_pat) {
3659 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3661 folded[0] = folded[1] = 's';
3663 pat_end = folded + 2;
3666 else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
3671 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
3673 *(d++) = (U8) toFOLD_LC(*s);
3678 _to_utf8_fold_flags(s,
3681 FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
3692 if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
3693 || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
3695 /* Multi-character folds require more context to sort out. Also
3696 * PL_utf8_foldclosures used below doesn't handle them, so have to
3697 * be handled outside this routine */
3698 use_chrtest_void = TRUE;
3700 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
3701 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
3703 /* Load the folds hash, if not already done */
3705 if (! PL_utf8_foldclosures) {
3706 if (! PL_utf8_tofold) {
3707 U8 dummy[UTF8_MAXBYTES_CASE+1];
3709 /* Force loading this by folding an above-Latin1 char */
3710 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
3711 assert(PL_utf8_tofold); /* Verify that worked */
3713 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
3716 /* The fold closures data structure is a hash with the keys
3717 * being the UTF-8 of every character that is folded to, like
3718 * 'k', and the values each an array of all code points that
3719 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
3720 * Multi-character folds are not included */
3721 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
3726 /* Not found in the hash, therefore there are no folds
3727 * containing it, so there is only a single character that
3731 else { /* Does participate in folds */
3732 AV* list = (AV*) *listp;
3733 if (av_tindex(list) != 1) {
3735 /* If there aren't exactly two folds to this, it is
3736 * outside the scope of this function */
3737 use_chrtest_void = TRUE;
3739 else { /* There are two. Get them */
3740 SV** c_p = av_fetch(list, 0, FALSE);
3742 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3746 c_p = av_fetch(list, 1, FALSE);
3748 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
3752 /* Folds that cross the 255/256 boundary are forbidden
3753 * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
3754 * one is ASCIII. Since the pattern character is above
3755 * 256, and its only other match is below 256, the only
3756 * legal match will be to itself. We have thrown away
3757 * the original, so have to compute which is the one
3759 if ((c1 < 256) != (c2 < 256)) {
3760 if ((OP(text_node) == EXACTFL
3761 && ! IN_UTF8_CTYPE_LOCALE)
3762 || ((OP(text_node) == EXACTFA
3763 || OP(text_node) == EXACTFA_NO_TRIE)
3764 && (isASCII(c1) || isASCII(c2))))
3777 else /* Here, c1 is < 255 */
3779 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
3780 && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
3781 && ((OP(text_node) != EXACTFA
3782 && OP(text_node) != EXACTFA_NO_TRIE)
3785 /* Here, there could be something above Latin1 in the target
3786 * which folds to this character in the pattern. All such
3787 * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
3788 * than two characters involved in their folds, so are outside
3789 * the scope of this function */
3790 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
3791 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
3794 use_chrtest_void = TRUE;
3797 else { /* Here nothing above Latin1 can fold to the pattern
3799 switch (OP(text_node)) {
3801 case EXACTFL: /* /l rules */
3802 c2 = PL_fold_locale[c1];
3805 case EXACTF: /* This node only generated for non-utf8
3807 assert(! is_utf8_pat);
3808 if (! utf8_target) { /* /d rules */
3813 /* /u rules for all these. This happens to work for
3814 * EXACTFA as nothing in Latin1 folds to ASCII */
3815 case EXACTFA_NO_TRIE: /* This node only generated for
3816 non-utf8 patterns */
3817 assert(! is_utf8_pat);
3822 c2 = PL_fold_latin1[c1];
3826 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
3827 assert(0); /* NOTREACHED */
3833 /* Here have figured things out. Set up the returns */
3834 if (use_chrtest_void) {
3835 *c2p = *c1p = CHRTEST_VOID;
3837 else if (utf8_target) {
3838 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
3839 uvchr_to_utf8(c1_utf8, c1);
3840 uvchr_to_utf8(c2_utf8, c2);
3843 /* Invariants are stored in both the utf8 and byte outputs; Use
3844 * negative numbers otherwise for the byte ones. Make sure that the
3845 * byte ones are the same iff the utf8 ones are the same */
3846 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
3847 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
3850 ? CHRTEST_NOT_A_CP_1
3851 : CHRTEST_NOT_A_CP_2;
3853 else if (c1 > 255) {
3854 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
3859 *c1p = *c2p = c2; /* c2 is the only representable value */
3861 else { /* c1 is representable; see about c2 */
3863 *c2p = (c2 < 256) ? c2 : c1;
3869 /* returns -1 on failure, $+[0] on success */
3871 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
3873 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3877 const bool utf8_target = reginfo->is_utf8_target;
3878 const U32 uniflags = UTF8_ALLOW_DEFAULT;
3879 REGEXP *rex_sv = reginfo->prog;
3880 regexp *rex = ReANY(rex_sv);
3881 RXi_GET_DECL(rex,rexi);
3882 /* the current state. This is a cached copy of PL_regmatch_state */
3884 /* cache heavy used fields of st in registers */
3887 U32 n = 0; /* general value; init to avoid compiler warning */
3888 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
3889 char *locinput = startpos;
3890 char *pushinput; /* where to continue after a PUSH */
3891 I32 nextchr; /* is always set to UCHARAT(locinput) */
3893 bool result = 0; /* return value of S_regmatch */
3894 int depth = 0; /* depth of backtrack stack */
3895 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
3896 const U32 max_nochange_depth =
3897 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
3898 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
3899 regmatch_state *yes_state = NULL; /* state to pop to on success of
3901 /* mark_state piggy backs on the yes_state logic so that when we unwind
3902 the stack on success we can update the mark_state as we go */
3903 regmatch_state *mark_state = NULL; /* last mark state we have seen */
3904 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
3905 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
3907 bool no_final = 0; /* prevent failure from backtracking? */
3908 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
3909 char *startpoint = locinput;
3910 SV *popmark = NULL; /* are we looking for a mark? */
3911 SV *sv_commit = NULL; /* last mark name seen in failure */
3912 SV *sv_yes_mark = NULL; /* last mark name we have seen
3913 during a successful match */
3914 U32 lastopen = 0; /* last open we saw */
3915 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
3916 SV* const oreplsv = GvSVn(PL_replgv);
3917 /* these three flags are set by various ops to signal information to
3918 * the very next op. They have a useful lifetime of exactly one loop
3919 * iteration, and are not preserved or restored by state pushes/pops
3921 bool sw = 0; /* the condition value in (?(cond)a|b) */
3922 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
3923 int logical = 0; /* the following EVAL is:
3927 or the following IFMATCH/UNLESSM is:
3928 false: plain (?=foo)
3929 true: used as a condition: (?(?=foo))
3931 PAD* last_pad = NULL;
3933 I32 gimme = G_SCALAR;
3934 CV *caller_cv = NULL; /* who called us */
3935 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
3936 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
3937 U32 maxopenparen = 0; /* max '(' index seen so far */
3938 int to_complement; /* Invert the result? */
3939 _char_class_number classnum;
3940 bool is_utf8_pat = reginfo->is_utf8_pat;
3943 GET_RE_DEBUG_FLAGS_DECL;
3946 /* protect against undef(*^R) */
3947 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
3949 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
3950 multicall_oldcatch = 0;
3951 multicall_cv = NULL;
3953 PERL_UNUSED_VAR(multicall_cop);
3954 PERL_UNUSED_VAR(newsp);
3957 PERL_ARGS_ASSERT_REGMATCH;
3959 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
3960 PerlIO_printf(Perl_debug_log,"regmatch start\n");
3963 st = PL_regmatch_state;
3965 /* Note that nextchr is a byte even in UTF */
3968 while (scan != NULL) {
3971 SV * const prop = sv_newmortal();
3972 regnode *rnext=regnext(scan);
3973 DUMP_EXEC_POS( locinput, scan, utf8_target );
3974 regprop(rex, prop, scan, reginfo);
3976 PerlIO_printf(Perl_debug_log,
3977 "%3"IVdf":%*s%s(%"IVdf")\n",
3978 (IV)(scan - rexi->program), depth*2, "",
3980 (PL_regkind[OP(scan)] == END || !rnext) ?
3981 0 : (IV)(rnext - rexi->program));
3984 next = scan + NEXT_OFF(scan);
3987 state_num = OP(scan);
3989 REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
3994 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
3996 switch (state_num) {
3997 case BOL: /* /^../ */
3998 case SBOL: /* /^../s */
3999 if (locinput == reginfo->strbeg)
4003 case MBOL: /* /^../m */
4004 if (locinput == reginfo->strbeg ||
4005 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
4012 if (locinput == reginfo->ganch)
4016 case KEEPS: /* \K */
4017 /* update the startpoint */
4018 st->u.keeper.val = rex->offs[0].start;
4019 rex->offs[0].start = locinput - reginfo->strbeg;
4020 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
4021 assert(0); /*NOTREACHED*/
4022 case KEEPS_next_fail:
4023 /* rollback the start point change */
4024 rex->offs[0].start = st->u.keeper.val;
4026 assert(0); /*NOTREACHED*/
4028 case MEOL: /* /..$/m */
4029 if (!NEXTCHR_IS_EOS && nextchr != '\n')
4033 case EOL: /* /..$/ */
4035 case SEOL: /* /..$/s */
4036 if (!NEXTCHR_IS_EOS && nextchr != '\n')
4038 if (reginfo->strend - locinput > 1)
4043 if (!NEXTCHR_IS_EOS)
4047 case SANY: /* /./s */
4050 goto increment_locinput;
4058 case REG_ANY: /* /./ */
4059 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
4061 goto increment_locinput;
4065 #define ST st->u.trie
4066 case TRIEC: /* (ab|cd) with known charclass */
4067 /* In this case the charclass data is available inline so
4068 we can fail fast without a lot of extra overhead.
4070 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
4072 PerlIO_printf(Perl_debug_log,
4073 "%*s %sfailed to match trie start class...%s\n",
4074 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4077 assert(0); /* NOTREACHED */
4080 case TRIE: /* (ab|cd) */
4081 /* the basic plan of execution of the trie is:
4082 * At the beginning, run though all the states, and
4083 * find the longest-matching word. Also remember the position
4084 * of the shortest matching word. For example, this pattern:
4087 * when matched against the string "abcde", will generate
4088 * accept states for all words except 3, with the longest
4089 * matching word being 4, and the shortest being 2 (with
4090 * the position being after char 1 of the string).
4092 * Then for each matching word, in word order (i.e. 1,2,4,5),
4093 * we run the remainder of the pattern; on each try setting
4094 * the current position to the character following the word,
4095 * returning to try the next word on failure.
4097 * We avoid having to build a list of words at runtime by
4098 * using a compile-time structure, wordinfo[].prev, which
4099 * gives, for each word, the previous accepting word (if any).
4100 * In the case above it would contain the mappings 1->2, 2->0,
4101 * 3->0, 4->5, 5->1. We can use this table to generate, from
4102 * the longest word (4 above), a list of all words, by
4103 * following the list of prev pointers; this gives us the
4104 * unordered list 4,5,1,2. Then given the current word we have
4105 * just tried, we can go through the list and find the
4106 * next-biggest word to try (so if we just failed on word 2,
4107 * the next in the list is 4).
4109 * Since at runtime we don't record the matching position in
4110 * the string for each word, we have to work that out for
4111 * each word we're about to process. The wordinfo table holds
4112 * the character length of each word; given that we recorded
4113 * at the start: the position of the shortest word and its
4114 * length in chars, we just need to move the pointer the
4115 * difference between the two char lengths. Depending on
4116 * Unicode status and folding, that's cheap or expensive.
4118 * This algorithm is optimised for the case where are only a
4119 * small number of accept states, i.e. 0,1, or maybe 2.
4120 * With lots of accepts states, and having to try all of them,
4121 * it becomes quadratic on number of accept states to find all
4126 /* what type of TRIE am I? (utf8 makes this contextual) */
4127 DECL_TRIE_TYPE(scan);
4129 /* what trie are we using right now */
4130 reg_trie_data * const trie
4131 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
4132 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
4133 U32 state = trie->startstate;
4136 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
4138 if (trie->states[ state ].wordnum) {
4140 PerlIO_printf(Perl_debug_log,
4141 "%*s %smatched empty string...%s\n",
4142 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4148 PerlIO_printf(Perl_debug_log,
4149 "%*s %sfailed to match trie start class...%s\n",
4150 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
4157 U8 *uc = ( U8* )locinput;
4161 U8 *uscan = (U8*)NULL;
4162 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
4163 U32 charcount = 0; /* how many input chars we have matched */
4164 U32 accepted = 0; /* have we seen any accepting states? */
4166 ST.jump = trie->jump;
4169 ST.longfold = FALSE; /* char longer if folded => it's harder */
4172 /* fully traverse the TRIE; note the position of the
4173 shortest accept state and the wordnum of the longest
4176 while ( state && uc <= (U8*)(reginfo->strend) ) {
4177 U32 base = trie->states[ state ].trans.base;
4181 wordnum = trie->states[ state ].wordnum;
4183 if (wordnum) { /* it's an accept state */
4186 /* record first match position */
4188 ST.firstpos = (U8*)locinput;
4193 ST.firstchars = charcount;
4196 if (!ST.nextword || wordnum < ST.nextword)
4197 ST.nextword = wordnum;
4198 ST.topword = wordnum;
4201 DEBUG_TRIE_EXECUTE_r({
4202 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
4203 PerlIO_printf( Perl_debug_log,
4204 "%*s %sState: %4"UVxf" Accepted: %c ",
4205 2+depth * 2, "", PL_colors[4],
4206 (UV)state, (accepted ? 'Y' : 'N'));
4209 /* read a char and goto next state */
4210 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
4212 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
4213 uscan, len, uvc, charid, foldlen,
4220 base + charid - 1 - trie->uniquecharcount)) >= 0)
4222 && ((U32)offset < trie->lasttrans)
4223 && trie->trans[offset].check == state)
4225 state = trie->trans[offset].next;
4236 DEBUG_TRIE_EXECUTE_r(
4237 PerlIO_printf( Perl_debug_log,
4238 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
4239 charid, uvc, (UV)state, PL_colors[5] );
4245 /* calculate total number of accept states */
4250 w = trie->wordinfo[w].prev;
4253 ST.accepted = accepted;
4257 PerlIO_printf( Perl_debug_log,
4258 "%*s %sgot %"IVdf" possible matches%s\n",
4259 REPORT_CODE_OFF + depth * 2, "",
4260 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
4262 goto trie_first_try; /* jump into the fail handler */
4264 assert(0); /* NOTREACHED */
4266 case TRIE_next_fail: /* we failed - try next alternative */
4270 REGCP_UNWIND(ST.cp);
4271 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
4273 if (!--ST.accepted) {
4275 PerlIO_printf( Perl_debug_log,
4276 "%*s %sTRIE failed...%s\n",
4277 REPORT_CODE_OFF+depth*2, "",
4284 /* Find next-highest word to process. Note that this code
4285 * is O(N^2) per trie run (O(N) per branch), so keep tight */
4288 U16 const nextword = ST.nextword;
4289 reg_trie_wordinfo * const wordinfo
4290 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
4291 for (word=ST.topword; word; word=wordinfo[word].prev) {
4292 if (word > nextword && (!min || word < min))
4305 ST.lastparen = rex->lastparen;
4306 ST.lastcloseparen = rex->lastcloseparen;
4310 /* find start char of end of current word */
4312 U32 chars; /* how many chars to skip */
4313 reg_trie_data * const trie
4314 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
4316 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
4318 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
4323 /* the hard option - fold each char in turn and find
4324 * its folded length (which may be different */
4325 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
4333 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
4341 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
4346 uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
4362 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
4363 ? ST.jump[ST.nextword]
4367 PerlIO_printf( Perl_debug_log,
4368 "%*s %sTRIE matched word #%d, continuing%s\n",
4369 REPORT_CODE_OFF+depth*2, "",
4376 if (ST.accepted > 1 || has_cutgroup) {
4377 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
4378 assert(0); /* NOTREACHED */
4380 /* only one choice left - just continue */
4382 AV *const trie_words
4383 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
4384 SV ** const tmp = av_fetch( trie_words,
4386 SV *sv= tmp ? sv_newmortal() : NULL;
4388 PerlIO_printf( Perl_debug_log,
4389 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
4390 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
4392 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
4393 PL_colors[0], PL_colors[1],
4394 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
4396 : "not compiled under -Dr",
4400 locinput = (char*)uc;
4401 continue; /* execute rest of RE */
4402 assert(0); /* NOTREACHED */
4406 case EXACT: { /* /abc/ */
4407 char *s = STRING(scan);
4409 if (utf8_target != is_utf8_pat) {
4410 /* The target and the pattern have differing utf8ness. */
4412 const char * const e = s + ln;
4415 /* The target is utf8, the pattern is not utf8.
4416 * Above-Latin1 code points can't match the pattern;
4417 * invariants match exactly, and the other Latin1 ones need
4418 * to be downgraded to a single byte in order to do the
4419 * comparison. (If we could be confident that the target
4420 * is not malformed, this could be refactored to have fewer
4421 * tests by just assuming that if the first bytes match, it
4422 * is an invariant, but there are tests in the test suite
4423 * dealing with (??{...}) which violate this) */
4425 if (l >= reginfo->strend
4426 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
4430 if (UTF8_IS_INVARIANT(*(U8*)l)) {
4437 if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
4447 /* The target is not utf8, the pattern is utf8. */
4449 if (l >= reginfo->strend
4450 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
4454 if (UTF8_IS_INVARIANT(*(U8*)s)) {
4461 if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
4473 /* The target and the pattern have the same utf8ness. */
4474 /* Inline the first character, for speed. */
4475 if (reginfo->strend - locinput < ln
4476 || UCHARAT(s) != nextchr
4477 || (ln > 1 && memNE(s, locinput, ln)))
4486 case EXACTFL: { /* /abc/il */
4488 const U8 * fold_array;
4490 U32 fold_utf8_flags;
4492 folder = foldEQ_locale;
4493 fold_array = PL_fold_locale;
4494 fold_utf8_flags = FOLDEQ_LOCALE;
4497 case EXACTFU_SS: /* /\x{df}/iu */
4498 case EXACTFU: /* /abc/iu */
4499 folder = foldEQ_latin1;
4500 fold_array = PL_fold_latin1;
4501 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
4504 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
4506 assert(! is_utf8_pat);
4508 case EXACTFA: /* /abc/iaa */
4509 folder = foldEQ_latin1;
4510 fold_array = PL_fold_latin1;
4511 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
4514 case EXACTF: /* /abc/i This node only generated for
4515 non-utf8 patterns */
4516 assert(! is_utf8_pat);
4518 fold_array = PL_fold;
4519 fold_utf8_flags = 0;
4527 || state_num == EXACTFU_SS
4528 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
4530 /* Either target or the pattern are utf8, or has the issue where
4531 * the fold lengths may differ. */
4532 const char * const l = locinput;
4533 char *e = reginfo->strend;
4535 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
4536 l, &e, 0, utf8_target, fold_utf8_flags))
4544 /* Neither the target nor the pattern are utf8 */
4545 if (UCHARAT(s) != nextchr
4547 && UCHARAT(s) != fold_array[nextchr])
4551 if (reginfo->strend - locinput < ln)
4553 if (ln > 1 && ! folder(s, locinput, ln))
4559 /* XXX Could improve efficiency by separating these all out using a
4560 * macro or in-line function. At that point regcomp.c would no longer
4561 * have to set the FLAGS fields of these */
4562 case BOUNDL: /* /\b/l */
4563 case NBOUNDL: /* /\B/l */
4564 case BOUND: /* /\b/ */
4565 case BOUNDU: /* /\b/u */
4566 case BOUNDA: /* /\b/a */
4567 case NBOUND: /* /\B/ */
4568 case NBOUNDU: /* /\B/u */
4569 case NBOUNDA: /* /\B/a */
4570 /* was last char in word? */
4572 && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
4573 && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
4575 if (locinput == reginfo->strbeg)
4578 const U8 * const r =
4579 reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
4581 ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,
4584 if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
4585 ln = isWORDCHAR_uni(ln);
4589 LOAD_UTF8_CHARCLASS_ALNUM();
4590 n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
4595 ln = isWORDCHAR_LC_uvchr(ln);
4596 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
4601 /* Here the string isn't utf8, or is utf8 and only ascii
4602 * characters are to match \w. In the latter case looking at
4603 * the byte just prior to the current one may be just the final
4604 * byte of a multi-byte character. This is ok. There are two
4606 * 1) it is a single byte character, and then the test is doing
4607 * just what it's supposed to.
4608 * 2) it is a multi-byte character, in which case the final
4609 * byte is never mistakable for ASCII, and so the test
4610 * will say it is not a word character, which is the
4611 * correct answer. */
4612 ln = (locinput != reginfo->strbeg) ?
4613 UCHARAT(locinput - 1) : '\n';
4614 switch (FLAGS(scan)) {
4615 case REGEX_UNICODE_CHARSET:
4616 ln = isWORDCHAR_L1(ln);
4617 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
4619 case REGEX_LOCALE_CHARSET:
4620 ln = isWORDCHAR_LC(ln);
4621 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
4623 case REGEX_DEPENDS_CHARSET:
4624 ln = isWORDCHAR(ln);
4625 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
4627 case REGEX_ASCII_RESTRICTED_CHARSET:
4628 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
4629 ln = isWORDCHAR_A(ln);
4630 n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
4633 Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
4637 /* Note requires that all BOUNDs be lower than all NBOUNDs in
4639 if (((!ln) == (!n)) == (OP(scan) < NBOUND))
4643 case ANYOF: /* /[abc]/ */
4647 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
4650 locinput += UTF8SKIP(locinput);
4653 if (!REGINCLASS(rex, scan, (U8*)locinput))
4659 /* The argument (FLAGS) to all the POSIX node types is the class number
4662 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
4666 case POSIXL: /* \w or [:punct:] etc. under /l */
4670 /* Use isFOO_lc() for characters within Latin1. (Note that
4671 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4672 * wouldn't be invariant) */
4673 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4674 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
4678 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4679 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
4680 (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4681 *(locinput + 1))))))
4686 else { /* Here, must be an above Latin-1 code point */
4687 goto utf8_posix_not_eos;
4690 /* Here, must be utf8 */
4691 locinput += UTF8SKIP(locinput);
4694 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
4698 case POSIXD: /* \w or [:punct:] etc. under /d */
4704 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
4706 if (NEXTCHR_IS_EOS) {
4710 /* All UTF-8 variants match */
4711 if (! UTF8_IS_INVARIANT(nextchr)) {
4712 goto increment_locinput;
4718 case POSIXA: /* \w or [:punct:] etc. under /a */
4721 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
4722 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
4723 * character is a single byte */
4726 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
4732 /* Here we are either not in utf8, or we matched a utf8-invariant,
4733 * so the next char is the next byte */
4737 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
4741 case POSIXU: /* \w or [:punct:] etc. under /u */
4743 if (NEXTCHR_IS_EOS) {
4748 /* Use _generic_isCC() for characters within Latin1. (Note that
4749 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
4750 * wouldn't be invariant) */
4751 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
4752 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
4759 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
4760 if (! (to_complement
4761 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
4769 else { /* Handle above Latin-1 code points */
4770 classnum = (_char_class_number) FLAGS(scan);
4771 if (classnum < _FIRST_NON_SWASH_CC) {
4773 /* Here, uses a swash to find such code points. Load if if
4774 * not done already */
4775 if (! PL_utf8_swash_ptrs[classnum]) {
4776 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
4777 PL_utf8_swash_ptrs[classnum]
4778 = _core_swash_init("utf8",
4781 PL_XPosix_ptrs[classnum], &flags);
4783 if (! (to_complement
4784 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
4785 (U8 *) locinput, TRUE))))
4790 else { /* Here, uses macros to find above Latin-1 code points */
4792 case _CC_ENUM_SPACE: /* XXX would require separate
4793 code if we revert the change
4794 of \v matching this */
4795 case _CC_ENUM_PSXSPC:
4796 if (! (to_complement
4797 ^ cBOOL(is_XPERLSPACE_high(locinput))))
4802 case _CC_ENUM_BLANK:
4803 if (! (to_complement
4804 ^ cBOOL(is_HORIZWS_high(locinput))))
4809 case _CC_ENUM_XDIGIT:
4810 if (! (to_complement
4811 ^ cBOOL(is_XDIGIT_high(locinput))))
4816 case _CC_ENUM_VERTSPACE:
4817 if (! (to_complement
4818 ^ cBOOL(is_VERTWS_high(locinput))))
4823 default: /* The rest, e.g. [:cntrl:], can't match
4825 if (! to_complement) {
4831 locinput += UTF8SKIP(locinput);
4835 case CLUMP: /* Match \X: logical Unicode character. This is defined as
4836 a Unicode extended Grapheme Cluster */
4837 /* From http://www.unicode.org/reports/tr29 (5.2 version). An
4838 extended Grapheme Cluster is:
4841 | Prepend* Begin Extend*
4844 Begin is: ( Special_Begin | ! Control )
4845 Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
4846 Extend is: ( Grapheme_Extend | Spacing_Mark )
4847 Control is: [ GCB_Control | CR | LF ]
4848 Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
4850 If we create a 'Regular_Begin' = Begin - Special_Begin, then
4853 Begin is ( Regular_Begin + Special Begin )
4855 It turns out that 98.4% of all Unicode code points match
4856 Regular_Begin. Doing it this way eliminates a table match in
4857 the previous implementation for almost all Unicode code points.
4859 There is a subtlety with Prepend* which showed up in testing.
4860 Note that the Begin, and only the Begin is required in:
4861 | Prepend* Begin Extend*
4862 Also, Begin contains '! Control'. A Prepend must be a
4863 '! Control', which means it must also be a Begin. What it
4864 comes down to is that if we match Prepend* and then find no
4865 suitable Begin afterwards, that if we backtrack the last
4866 Prepend, that one will be a suitable Begin.
4871 if (! utf8_target) {
4873 /* Match either CR LF or '.', as all the other possibilities
4875 locinput++; /* Match the . or CR */
4876 if (nextchr == '\r' /* And if it was CR, and the next is LF,
4878 && locinput < reginfo->strend
4879 && UCHARAT(locinput) == '\n')
4886 /* Utf8: See if is ( CR LF ); already know that locinput <
4887 * reginfo->strend, so locinput+1 is in bounds */
4888 if ( nextchr == '\r' && locinput+1 < reginfo->strend
4889 && UCHARAT(locinput + 1) == '\n')
4896 /* In case have to backtrack to beginning, then match '.' */
4897 char *starting = locinput;
4899 /* In case have to backtrack the last prepend */
4900 char *previous_prepend = NULL;
4902 LOAD_UTF8_CHARCLASS_GCB();
4904 /* Match (prepend)* */
4905 while (locinput < reginfo->strend
4906 && (len = is_GCB_Prepend_utf8(locinput)))
4908 previous_prepend = locinput;
4912 /* As noted above, if we matched a prepend character, but
4913 * the next thing won't match, back off the last prepend we
4914 * matched, as it is guaranteed to match the begin */
4915 if (previous_prepend
4916 && (locinput >= reginfo->strend
4917 || (! swash_fetch(PL_utf8_X_regular_begin,
4918 (U8*)locinput, utf8_target)
4919 && ! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)))
4922 locinput = previous_prepend;
4925 /* Note that here we know reginfo->strend > locinput, as we
4926 * tested that upon input to this switch case, and if we
4927 * moved locinput forward, we tested the result just above
4928 * and it either passed, or we backed off so that it will
4930 if (swash_fetch(PL_utf8_X_regular_begin,
4931 (U8*)locinput, utf8_target)) {
4932 locinput += UTF8SKIP(locinput);
4934 else if (! is_GCB_SPECIAL_BEGIN_START_utf8(locinput)) {
4936 /* Here did not match the required 'Begin' in the
4937 * second term. So just match the very first
4938 * character, the '.' of the final term of the regex */
4939 locinput = starting + UTF8SKIP(starting);
4943 /* Here is a special begin. It can be composed of
4944 * several individual characters. One possibility is
4946 if ((len = is_GCB_RI_utf8(locinput))) {
4948 while (locinput < reginfo->strend
4949 && (len = is_GCB_RI_utf8(locinput)))
4953 } else if ((len = is_GCB_T_utf8(locinput))) {
4954 /* Another possibility is T+ */
4956 while (locinput < reginfo->strend
4957 && (len = is_GCB_T_utf8(locinput)))
4963 /* Here, neither RI+ nor T+; must be some other
4964 * Hangul. That means it is one of the others: L,
4965 * LV, LVT or V, and matches:
4966 * L* (L | LVT T* | V * V* T* | LV V* T*) */
4969 while (locinput < reginfo->strend
4970 && (len = is_GCB_L_utf8(locinput)))
4975 /* Here, have exhausted L*. If the next character
4976 * is not an LV, LVT nor V, it means we had to have
4977 * at least one L, so matches L+ in the original
4978 * equation, we have a complete hangul syllable.
4981 if (locinput < reginfo->strend
4982 && is_GCB_LV_LVT_V_utf8(locinput))
4984 /* Otherwise keep going. Must be LV, LVT or V.
4985 * See if LVT, by first ruling out V, then LV */
4986 if (! is_GCB_V_utf8(locinput)
4987 /* All but every TCount one is LV */
4988 && (valid_utf8_to_uvchr((U8 *) locinput,
4993 locinput += UTF8SKIP(locinput);
4996 /* Must be V or LV. Take it, then match
4998 locinput += UTF8SKIP(locinput);
4999 while (locinput < reginfo->strend
5000 && (len = is_GCB_V_utf8(locinput)))
5006 /* And any of LV, LVT, or V can be followed
5008 while (locinput < reginfo->strend
5009 && (len = is_GCB_T_utf8(locinput)))
5017 /* Match any extender */
5018 while (locinput < reginfo->strend
5019 && swash_fetch(PL_utf8_X_extend,
5020 (U8*)locinput, utf8_target))
5022 locinput += UTF8SKIP(locinput);
5026 if (locinput > reginfo->strend) sayNO;
5030 case NREFFL: /* /\g{name}/il */
5031 { /* The capture buffer cases. The ones beginning with N for the
5032 named buffers just convert to the equivalent numbered and
5033 pretend they were called as the corresponding numbered buffer
5035 /* don't initialize these in the declaration, it makes C++
5040 const U8 *fold_array;
5043 folder = foldEQ_locale;
5044 fold_array = PL_fold_locale;
5046 utf8_fold_flags = FOLDEQ_LOCALE;
5049 case NREFFA: /* /\g{name}/iaa */
5050 folder = foldEQ_latin1;
5051 fold_array = PL_fold_latin1;
5053 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5056 case NREFFU: /* /\g{name}/iu */
5057 folder = foldEQ_latin1;
5058 fold_array = PL_fold_latin1;
5060 utf8_fold_flags = 0;
5063 case NREFF: /* /\g{name}/i */
5065 fold_array = PL_fold;
5067 utf8_fold_flags = 0;
5070 case NREF: /* /\g{name}/ */
5074 utf8_fold_flags = 0;
5077 /* For the named back references, find the corresponding buffer
5079 n = reg_check_named_buff_matched(rex,scan);
5084 goto do_nref_ref_common;
5086 case REFFL: /* /\1/il */
5087 folder = foldEQ_locale;
5088 fold_array = PL_fold_locale;
5089 utf8_fold_flags = FOLDEQ_LOCALE;
5092 case REFFA: /* /\1/iaa */
5093 folder = foldEQ_latin1;
5094 fold_array = PL_fold_latin1;
5095 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5098 case REFFU: /* /\1/iu */
5099 folder = foldEQ_latin1;
5100 fold_array = PL_fold_latin1;
5101 utf8_fold_flags = 0;
5104 case REFF: /* /\1/i */
5106 fold_array = PL_fold;
5107 utf8_fold_flags = 0;
5110 case REF: /* /\1/ */
5113 utf8_fold_flags = 0;
5117 n = ARG(scan); /* which paren pair */
5120 ln = rex->offs[n].start;
5121 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5122 if (rex->lastparen < n || ln == -1)
5123 sayNO; /* Do not match unless seen CLOSEn. */
5124 if (ln == rex->offs[n].end)
5127 s = reginfo->strbeg + ln;
5128 if (type != REF /* REF can do byte comparison */
5129 && (utf8_target || type == REFFU || type == REFFL))
5131 char * limit = reginfo->strend;
5133 /* This call case insensitively compares the entire buffer
5134 * at s, with the current input starting at locinput, but
5135 * not going off the end given by reginfo->strend, and
5136 * returns in <limit> upon success, how much of the
5137 * current input was matched */
5138 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
5139 locinput, &limit, 0, utf8_target, utf8_fold_flags))
5147 /* Not utf8: Inline the first character, for speed. */
5148 if (!NEXTCHR_IS_EOS &&
5149 UCHARAT(s) != nextchr &&
5151 UCHARAT(s) != fold_array[nextchr]))
5153 ln = rex->offs[n].end - ln;
5154 if (locinput + ln > reginfo->strend)
5156 if (ln > 1 && (type == REF
5157 ? memNE(s, locinput, ln)
5158 : ! folder(s, locinput, ln)))
5164 case NOTHING: /* null op; e.g. the 'nothing' following
5165 * the '*' in m{(a+|b)*}' */
5167 case TAIL: /* placeholder while compiling (A|B|C) */
5170 case BACK: /* ??? doesn't appear to be used ??? */
5174 #define ST st->u.eval
5179 regexp_internal *rei;
5180 regnode *startpoint;
5182 case GOSTART: /* (?R) */
5183 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
5184 if (cur_eval && cur_eval->locinput==locinput) {
5185 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
5186 Perl_croak(aTHX_ "Infinite recursion in regex");
5187 if ( ++nochange_depth > max_nochange_depth )
5189 "Pattern subroutine nesting without pos change"
5190 " exceeded limit in regex");
5197 if (OP(scan)==GOSUB) {
5198 startpoint = scan + ARG2L(scan);
5199 ST.close_paren = ARG(scan);
5201 startpoint = rei->program+1;
5205 /* Save all the positions seen so far. */
5206 ST.cp = regcppush(rex, 0, maxopenparen);
5207 REGCP_SET(ST.lastcp);
5209 /* and then jump to the code we share with EVAL */
5210 goto eval_recurse_doit;
5212 assert(0); /* NOTREACHED */
5214 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
5215 if (cur_eval && cur_eval->locinput==locinput) {
5216 if ( ++nochange_depth > max_nochange_depth )
5217 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
5222 /* execute the code in the {...} */
5226 OP * const oop = PL_op;
5227 COP * const ocurcop = PL_curcop;
5231 /* save *all* paren positions */
5232 regcppush(rex, 0, maxopenparen);
5233 REGCP_SET(runops_cp);
5236 caller_cv = find_runcv(NULL);
5240 if (rexi->data->what[n] == 'r') { /* code from an external qr */
5242 (REGEXP*)(rexi->data->data[n])
5245 nop = (OP*)rexi->data->data[n+1];
5247 else if (rexi->data->what[n] == 'l') { /* literal code */
5249 nop = (OP*)rexi->data->data[n];
5250 assert(CvDEPTH(newcv));
5253 /* literal with own CV */
5254 assert(rexi->data->what[n] == 'L');
5255 newcv = rex->qr_anoncv;
5256 nop = (OP*)rexi->data->data[n];
5259 /* normally if we're about to execute code from the same
5260 * CV that we used previously, we just use the existing
5261 * CX stack entry. However, its possible that in the
5262 * meantime we may have backtracked, popped from the save
5263 * stack, and undone the SAVECOMPPAD(s) associated with
5264 * PUSH_MULTICALL; in which case PL_comppad no longer
5265 * points to newcv's pad. */
5266 if (newcv != last_pushed_cv || PL_comppad != last_pad)
5268 U8 flags = (CXp_SUB_RE |
5269 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
5270 if (last_pushed_cv) {
5271 CHANGE_MULTICALL_FLAGS(newcv, flags);
5274 PUSH_MULTICALL_FLAGS(newcv, flags);
5276 last_pushed_cv = newcv;
5279 /* these assignments are just to silence compiler
5281 multicall_cop = NULL;
5284 last_pad = PL_comppad;
5286 /* the initial nextstate you would normally execute
5287 * at the start of an eval (which would cause error
5288 * messages to come from the eval), may be optimised
5289 * away from the execution path in the regex code blocks;
5290 * so manually set PL_curcop to it initially */
5292 OP *o = cUNOPx(nop)->op_first;
5293 assert(o->op_type == OP_NULL);
5294 if (o->op_targ == OP_SCOPE) {
5295 o = cUNOPo->op_first;
5298 assert(o->op_targ == OP_LEAVE);
5299 o = cUNOPo->op_first;
5300 assert(o->op_type == OP_ENTER);
5304 if (o->op_type != OP_STUB) {
5305 assert( o->op_type == OP_NEXTSTATE
5306 || o->op_type == OP_DBSTATE
5307 || (o->op_type == OP_NULL
5308 && ( o->op_targ == OP_NEXTSTATE
5309 || o->op_targ == OP_DBSTATE
5313 PL_curcop = (COP*)o;
5318 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
5319 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
5321 rex->offs[0].end = locinput - reginfo->strbeg;
5322 if (reginfo->info_aux_eval->pos_magic)
5323 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
5324 reginfo->sv, reginfo->strbeg,
5325 locinput - reginfo->strbeg);
5328 SV *sv_mrk = get_sv("REGMARK", 1);
5329 sv_setsv(sv_mrk, sv_yes_mark);
5332 /* we don't use MULTICALL here as we want to call the
5333 * first op of the block of interest, rather than the
5334 * first op of the sub */
5335 before = (IV)(SP-PL_stack_base);
5337 CALLRUNOPS(aTHX); /* Scalar context. */
5339 if ((IV)(SP-PL_stack_base) == before)
5340 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
5346 /* before restoring everything, evaluate the returned
5347 * value, so that 'uninit' warnings don't use the wrong
5348 * PL_op or pad. Also need to process any magic vars
5349 * (e.g. $1) *before* parentheses are restored */
5354 if (logical == 0) /* (?{})/ */
5355 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
5356 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
5357 sw = cBOOL(SvTRUE(ret));
5360 else { /* /(??{}) */
5361 /* if its overloaded, let the regex compiler handle
5362 * it; otherwise extract regex, or stringify */
5363 if (SvGMAGICAL(ret))
5364 ret = sv_mortalcopy(ret);
5365 if (!SvAMAGIC(ret)) {
5369 if (SvTYPE(sv) == SVt_REGEXP)
5370 re_sv = (REGEXP*) sv;
5371 else if (SvSMAGICAL(ret)) {
5372 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
5374 re_sv = (REGEXP *) mg->mg_obj;
5377 /* force any undef warnings here */
5378 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
5379 ret = sv_mortalcopy(ret);
5380 (void) SvPV_force_nolen(ret);
5386 /* *** Note that at this point we don't restore
5387 * PL_comppad, (or pop the CxSUB) on the assumption it may
5388 * be used again soon. This is safe as long as nothing
5389 * in the regexp code uses the pad ! */
5391 PL_curcop = ocurcop;
5392 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
5393 PL_curpm = PL_reg_curpm;
5399 /* only /(??{})/ from now on */
5402 /* extract RE object from returned value; compiling if
5406 re_sv = reg_temp_copy(NULL, re_sv);
5411 if (SvUTF8(ret) && IN_BYTES) {
5412 /* In use 'bytes': make a copy of the octet
5413 * sequence, but without the flag on */
5415 const char *const p = SvPV(ret, len);
5416 ret = newSVpvn_flags(p, len, SVs_TEMP);
5418 if (rex->intflags & PREGf_USE_RE_EVAL)
5419 pm_flags |= PMf_USE_RE_EVAL;
5421 /* if we got here, it should be an engine which
5422 * supports compiling code blocks and stuff */
5423 assert(rex->engine && rex->engine->op_comp);
5424 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
5425 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
5426 rex->engine, NULL, NULL,
5427 /* copy /msix etc to inner pattern */
5432 & (SVs_TEMP | SVs_GMG | SVf_ROK))
5433 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
5434 /* This isn't a first class regexp. Instead, it's
5435 caching a regexp onto an existing, Perl visible
5437 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
5443 RXp_MATCH_COPIED_off(re);
5444 re->subbeg = rex->subbeg;
5445 re->sublen = rex->sublen;
5446 re->suboffset = rex->suboffset;
5447 re->subcoffset = rex->subcoffset;
5449 re->lastcloseparen = 0;
5452 debug_start_match(re_sv, utf8_target, locinput,
5453 reginfo->strend, "Matching embedded");
5455 startpoint = rei->program + 1;
5456 ST.close_paren = 0; /* only used for GOSUB */
5457 /* Save all the seen positions so far. */
5458 ST.cp = regcppush(rex, 0, maxopenparen);
5459 REGCP_SET(ST.lastcp);
5460 /* and set maxopenparen to 0, since we are starting a "fresh" match */
5462 /* run the pattern returned from (??{...}) */
5464 eval_recurse_doit: /* Share code with GOSUB below this line
5465 * At this point we expect the stack context to be
5466 * set up correctly */
5468 /* invalidate the S-L poscache. We're now executing a
5469 * different set of WHILEM ops (and their associated
5470 * indexes) against the same string, so the bits in the
5471 * cache are meaningless. Setting maxiter to zero forces
5472 * the cache to be invalidated and zeroed before reuse.
5473 * XXX This is too dramatic a measure. Ideally we should
5474 * save the old cache and restore when running the outer
5476 reginfo->poscache_maxiter = 0;
5478 /* the new regexp might have a different is_utf8_pat than we do */
5479 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
5481 ST.prev_rex = rex_sv;
5482 ST.prev_curlyx = cur_curlyx;
5484 SET_reg_curpm(rex_sv);
5489 ST.prev_eval = cur_eval;
5491 /* now continue from first node in postoned RE */
5492 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
5493 assert(0); /* NOTREACHED */
5496 case EVAL_AB: /* cleanup after a successful (??{A})B */
5497 /* note: this is called twice; first after popping B, then A */
5498 rex_sv = ST.prev_rex;
5499 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5500 SET_reg_curpm(rex_sv);
5501 rex = ReANY(rex_sv);
5502 rexi = RXi_GET(rex);
5504 /* preserve $^R across LEAVE's. See Bug 121070. */
5505 SV *save_sv= GvSV(PL_replgv);
5506 SvREFCNT_inc(save_sv);
5507 regcpblow(ST.cp); /* LEAVE in disguise */
5508 sv_setsv(GvSV(PL_replgv), save_sv);
5509 SvREFCNT_dec(save_sv);
5511 cur_eval = ST.prev_eval;
5512 cur_curlyx = ST.prev_curlyx;
5514 /* Invalidate cache. See "invalidate" comment above. */
5515 reginfo->poscache_maxiter = 0;
5516 if ( nochange_depth )
5521 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
5522 /* note: this is called twice; first after popping B, then A */
5523 rex_sv = ST.prev_rex;
5524 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
5525 SET_reg_curpm(rex_sv);
5526 rex = ReANY(rex_sv);
5527 rexi = RXi_GET(rex);
5529 REGCP_UNWIND(ST.lastcp);
5530 regcppop(rex, &maxopenparen);
5531 cur_eval = ST.prev_eval;
5532 cur_curlyx = ST.prev_curlyx;
5533 /* Invalidate cache. See "invalidate" comment above. */
5534 reginfo->poscache_maxiter = 0;
5535 if ( nochange_depth )
5541 n = ARG(scan); /* which paren pair */
5542 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
5543 if (n > maxopenparen)
5545 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
5546 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
5550 (IV)rex->offs[n].start_tmp,
5556 /* XXX really need to log other places start/end are set too */
5557 #define CLOSE_CAPTURE \
5558 rex->offs[n].start = rex->offs[n].start_tmp; \
5559 rex->offs[n].end = locinput - reginfo->strbeg; \
5560 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
5561 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
5563 PTR2UV(rex->offs), \
5565 (IV)rex->offs[n].start, \
5566 (IV)rex->offs[n].end \
5570 n = ARG(scan); /* which paren pair */
5572 if (n > rex->lastparen)
5574 rex->lastcloseparen = n;
5575 if (cur_eval && cur_eval->u.eval.close_paren == n) {
5580 case ACCEPT: /* (*ACCEPT) */
5584 cursor && OP(cursor)!=END;
5585 cursor=regnext(cursor))
5587 if ( OP(cursor)==CLOSE ){
5589 if ( n <= lastopen ) {
5591 if (n > rex->lastparen)
5593 rex->lastcloseparen = n;
5594 if ( n == ARG(scan) || (cur_eval &&
5595 cur_eval->u.eval.close_paren == n))
5604 case GROUPP: /* (?(1)) */
5605 n = ARG(scan); /* which paren pair */
5606 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
5609 case NGROUPP: /* (?(<name>)) */
5610 /* reg_check_named_buff_matched returns 0 for no match */
5611 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
5614 case INSUBP: /* (?(R)) */
5616 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
5619 case DEFINEP: /* (?(DEFINE)) */
5623 case IFTHEN: /* (?(cond)A|B) */
5624 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
5626 next = NEXTOPER(NEXTOPER(scan));
5628 next = scan + ARG(scan);
5629 if (OP(next) == IFTHEN) /* Fake one. */
5630 next = NEXTOPER(NEXTOPER(next));
5634 case LOGICAL: /* modifier for EVAL and IFMATCH */
5635 logical = scan->flags;
5638 /*******************************************************************
5640 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
5641 pattern, where A and B are subpatterns. (For simple A, CURLYM or
5642 STAR/PLUS/CURLY/CURLYN are used instead.)
5644 A*B is compiled as <CURLYX><A><WHILEM><B>
5646 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
5647 state, which contains the current count, initialised to -1. It also sets
5648 cur_curlyx to point to this state, with any previous value saved in the
5651 CURLYX then jumps straight to the WHILEM op, rather than executing A,
5652 since the pattern may possibly match zero times (i.e. it's a while {} loop
5653 rather than a do {} while loop).
5655 Each entry to WHILEM represents a successful match of A. The count in the
5656 CURLYX block is incremented, another WHILEM state is pushed, and execution
5657 passes to A or B depending on greediness and the current count.
5659 For example, if matching against the string a1a2a3b (where the aN are
5660 substrings that match /A/), then the match progresses as follows: (the
5661 pushed states are interspersed with the bits of strings matched so far):
5664 <CURLYX cnt=0><WHILEM>
5665 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
5666 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
5667 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
5668 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
5670 (Contrast this with something like CURLYM, which maintains only a single
5674 a1 <CURLYM cnt=1> a2
5675 a1 a2 <CURLYM cnt=2> a3
5676 a1 a2 a3 <CURLYM cnt=3> b
5679 Each WHILEM state block marks a point to backtrack to upon partial failure
5680 of A or B, and also contains some minor state data related to that
5681 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
5682 overall state, such as the count, and pointers to the A and B ops.
5684 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
5685 must always point to the *current* CURLYX block, the rules are:
5687 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
5688 and set cur_curlyx to point the new block.
5690 When popping the CURLYX block after a successful or unsuccessful match,
5691 restore the previous cur_curlyx.
5693 When WHILEM is about to execute B, save the current cur_curlyx, and set it
5694 to the outer one saved in the CURLYX block.
5696 When popping the WHILEM block after a successful or unsuccessful B match,
5697 restore the previous cur_curlyx.
5699 Here's an example for the pattern (AI* BI)*BO
5700 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
5703 curlyx backtrack stack
5704 ------ ---------------
5706 CO <CO prev=NULL> <WO>
5707 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5708 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5709 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
5711 At this point the pattern succeeds, and we work back down the stack to
5712 clean up, restoring as we go:
5714 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
5715 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
5716 CO <CO prev=NULL> <WO>
5719 *******************************************************************/
5721 #define ST st->u.curlyx
5723 case CURLYX: /* start of /A*B/ (for complex A) */
5725 /* No need to save/restore up to this paren */
5726 I32 parenfloor = scan->flags;
5728 assert(next); /* keep Coverity happy */
5729 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
5732 /* XXXX Probably it is better to teach regpush to support
5733 parenfloor > maxopenparen ... */
5734 if (parenfloor > (I32)rex->lastparen)
5735 parenfloor = rex->lastparen; /* Pessimization... */
5737 ST.prev_curlyx= cur_curlyx;
5739 ST.cp = PL_savestack_ix;
5741 /* these fields contain the state of the current curly.
5742 * they are accessed by subsequent WHILEMs */
5743 ST.parenfloor = parenfloor;
5748 ST.count = -1; /* this will be updated by WHILEM */
5749 ST.lastloc = NULL; /* this will be updated by WHILEM */
5751 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
5752 assert(0); /* NOTREACHED */
5755 case CURLYX_end: /* just finished matching all of A*B */
5756 cur_curlyx = ST.prev_curlyx;
5758 assert(0); /* NOTREACHED */
5760 case CURLYX_end_fail: /* just failed to match all of A*B */
5762 cur_curlyx = ST.prev_curlyx;
5764 assert(0); /* NOTREACHED */
5768 #define ST st->u.whilem
5770 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
5772 /* see the discussion above about CURLYX/WHILEM */
5774 int min = ARG1(cur_curlyx->u.curlyx.me);
5775 int max = ARG2(cur_curlyx->u.curlyx.me);
5776 regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
5778 assert(cur_curlyx); /* keep Coverity happy */
5779 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
5780 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
5781 ST.cache_offset = 0;
5785 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5786 "%*s whilem: matched %ld out of %d..%d\n",
5787 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
5790 /* First just match a string of min A's. */
5793 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5795 cur_curlyx->u.curlyx.lastloc = locinput;
5796 REGCP_SET(ST.lastcp);
5798 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
5799 assert(0); /* NOTREACHED */
5802 /* If degenerate A matches "", assume A done. */
5804 if (locinput == cur_curlyx->u.curlyx.lastloc) {
5805 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5806 "%*s whilem: empty match detected, trying continuation...\n",
5807 REPORT_CODE_OFF+depth*2, "")
5809 goto do_whilem_B_max;
5812 /* super-linear cache processing.
5814 * The idea here is that for certain types of CURLYX/WHILEM -
5815 * principally those whose upper bound is infinity (and
5816 * excluding regexes that have things like \1 and other very
5817 * non-regular expresssiony things), then if a pattern like
5818 * /....A*.../ fails and we backtrack to the WHILEM, then we
5819 * make a note that this particular WHILEM op was at string
5820 * position 47 (say) when the rest of pattern failed. Then, if
5821 * we ever find ourselves back at that WHILEM, and at string
5822 * position 47 again, we can just fail immediately rather than
5823 * running the rest of the pattern again.
5825 * This is very handy when patterns start to go
5826 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
5827 * with a combinatorial explosion of backtracking.
5829 * The cache is implemented as a bit array, with one bit per
5830 * string byte position per WHILEM op (up to 16) - so its
5831 * between 0.25 and 2x the string size.
5833 * To avoid allocating a poscache buffer every time, we do an
5834 * initially countdown; only after we have executed a WHILEM
5835 * op (string-length x #WHILEMs) times do we allocate the
5838 * The top 4 bits of scan->flags byte say how many different
5839 * relevant CURLLYX/WHILEM op pairs there are, while the
5840 * bottom 4-bits is the identifying index number of this
5846 if (!reginfo->poscache_maxiter) {
5847 /* start the countdown: Postpone detection until we
5848 * know the match is not *that* much linear. */
5849 reginfo->poscache_maxiter
5850 = (reginfo->strend - reginfo->strbeg + 1)
5852 /* possible overflow for long strings and many CURLYX's */
5853 if (reginfo->poscache_maxiter < 0)
5854 reginfo->poscache_maxiter = I32_MAX;
5855 reginfo->poscache_iter = reginfo->poscache_maxiter;
5858 if (reginfo->poscache_iter-- == 0) {
5859 /* initialise cache */
5860 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
5861 regmatch_info_aux *const aux = reginfo->info_aux;
5862 if (aux->poscache) {
5863 if ((SSize_t)reginfo->poscache_size < size) {
5864 Renew(aux->poscache, size, char);
5865 reginfo->poscache_size = size;
5867 Zero(aux->poscache, size, char);
5870 reginfo->poscache_size = size;
5871 Newxz(aux->poscache, size, char);
5873 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5874 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
5875 PL_colors[4], PL_colors[5])
5879 if (reginfo->poscache_iter < 0) {
5880 /* have we already failed at this position? */
5881 SSize_t offset, mask;
5883 reginfo->poscache_iter = -1; /* stop eventual underflow */
5884 offset = (scan->flags & 0xf) - 1
5885 + (locinput - reginfo->strbeg)
5887 mask = 1 << (offset % 8);
5889 if (reginfo->info_aux->poscache[offset] & mask) {
5890 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
5891 "%*s whilem: (cache) already tried at this position...\n",
5892 REPORT_CODE_OFF+depth*2, "")
5894 sayNO; /* cache records failure */
5896 ST.cache_offset = offset;
5897 ST.cache_mask = mask;
5901 /* Prefer B over A for minimal matching. */
5903 if (cur_curlyx->u.curlyx.minmod) {
5904 ST.save_curlyx = cur_curlyx;
5905 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5906 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
5908 REGCP_SET(ST.lastcp);
5909 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
5911 assert(0); /* NOTREACHED */
5914 /* Prefer A over B for maximal matching. */
5916 if (n < max) { /* More greed allowed? */
5917 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
5919 cur_curlyx->u.curlyx.lastloc = locinput;
5920 REGCP_SET(ST.lastcp);
5921 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
5922 assert(0); /* NOTREACHED */
5924 goto do_whilem_B_max;
5926 assert(0); /* NOTREACHED */
5928 case WHILEM_B_min: /* just matched B in a minimal match */
5929 case WHILEM_B_max: /* just matched B in a maximal match */
5930 cur_curlyx = ST.save_curlyx;
5932 assert(0); /* NOTREACHED */
5934 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
5935 cur_curlyx = ST.save_curlyx;
5936 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5937 cur_curlyx->u.curlyx.count--;
5939 assert(0); /* NOTREACHED */
5941 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
5943 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
5944 REGCP_UNWIND(ST.lastcp);
5945 regcppop(rex, &maxopenparen);
5946 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
5947 cur_curlyx->u.curlyx.count--;
5949 assert(0); /* NOTREACHED */
5951 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
5952 REGCP_UNWIND(ST.lastcp);
5953 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
5954 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5955 "%*s whilem: failed, trying continuation...\n",
5956 REPORT_CODE_OFF+depth*2, "")
5959 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5960 && ckWARN(WARN_REGEXP)
5961 && !reginfo->warned)
5963 reginfo->warned = TRUE;
5964 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5965 "Complex regular subexpression recursion limit (%d) "
5971 ST.save_curlyx = cur_curlyx;
5972 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
5973 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
5975 assert(0); /* NOTREACHED */
5977 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
5978 cur_curlyx = ST.save_curlyx;
5979 REGCP_UNWIND(ST.lastcp);
5980 regcppop(rex, &maxopenparen);
5982 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
5983 /* Maximum greed exceeded */
5984 if (cur_curlyx->u.curlyx.count >= REG_INFTY
5985 && ckWARN(WARN_REGEXP)
5986 && !reginfo->warned)
5988 reginfo->warned = TRUE;
5989 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
5990 "Complex regular subexpression recursion "
5991 "limit (%d) exceeded",
5994 cur_curlyx->u.curlyx.count--;
5998 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
5999 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
6001 /* Try grabbing another A and see if it helps. */
6002 cur_curlyx->u.curlyx.lastloc = locinput;
6003 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6005 REGCP_SET(ST.lastcp);
6006 PUSH_STATE_GOTO(WHILEM_A_min,
6007 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
6009 assert(0); /* NOTREACHED */
6012 #define ST st->u.branch
6014 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
6015 next = scan + ARG(scan);
6018 scan = NEXTOPER(scan);
6021 case BRANCH: /* /(...|A|...)/ */
6022 scan = NEXTOPER(scan); /* scan now points to inner node */
6023 ST.lastparen = rex->lastparen;
6024 ST.lastcloseparen = rex->lastcloseparen;
6025 ST.next_branch = next;
6028 /* Now go into the branch */
6030 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
6032 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
6034 assert(0); /* NOTREACHED */
6036 case CUTGROUP: /* /(*THEN)/ */
6037 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
6038 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6039 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
6040 assert(0); /* NOTREACHED */
6042 case CUTGROUP_next_fail:
6045 if (st->u.mark.mark_name)
6046 sv_commit = st->u.mark.mark_name;
6048 assert(0); /* NOTREACHED */
6052 assert(0); /* NOTREACHED */
6054 case BRANCH_next_fail: /* that branch failed; try the next, if any */
6059 REGCP_UNWIND(ST.cp);
6060 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6061 scan = ST.next_branch;
6062 /* no more branches? */
6063 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
6065 PerlIO_printf( Perl_debug_log,
6066 "%*s %sBRANCH failed...%s\n",
6067 REPORT_CODE_OFF+depth*2, "",
6073 continue; /* execute next BRANCH[J] op */
6074 assert(0); /* NOTREACHED */
6076 case MINMOD: /* next op will be non-greedy, e.g. A*? */
6081 #define ST st->u.curlym
6083 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
6085 /* This is an optimisation of CURLYX that enables us to push
6086 * only a single backtracking state, no matter how many matches
6087 * there are in {m,n}. It relies on the pattern being constant
6088 * length, with no parens to influence future backrefs
6092 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6094 ST.lastparen = rex->lastparen;
6095 ST.lastcloseparen = rex->lastcloseparen;
6097 /* if paren positive, emulate an OPEN/CLOSE around A */
6099 U32 paren = ST.me->flags;
6100 if (paren > maxopenparen)
6101 maxopenparen = paren;
6102 scan += NEXT_OFF(scan); /* Skip former OPEN. */
6110 ST.c1 = CHRTEST_UNINIT;
6113 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
6116 curlym_do_A: /* execute the A in /A{m,n}B/ */
6117 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
6118 assert(0); /* NOTREACHED */
6120 case CURLYM_A: /* we've just matched an A */
6122 /* after first match, determine A's length: u.curlym.alen */
6123 if (ST.count == 1) {
6124 if (reginfo->is_utf8_target) {
6125 char *s = st->locinput;
6126 while (s < locinput) {
6132 ST.alen = locinput - st->locinput;
6135 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
6138 PerlIO_printf(Perl_debug_log,
6139 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
6140 (int)(REPORT_CODE_OFF+(depth*2)), "",
6141 (IV) ST.count, (IV)ST.alen)
6144 if (cur_eval && cur_eval->u.eval.close_paren &&
6145 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6149 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
6150 if ( max == REG_INFTY || ST.count < max )
6151 goto curlym_do_A; /* try to match another A */
6153 goto curlym_do_B; /* try to match B */
6155 case CURLYM_A_fail: /* just failed to match an A */
6156 REGCP_UNWIND(ST.cp);
6158 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
6159 || (cur_eval && cur_eval->u.eval.close_paren &&
6160 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
6163 curlym_do_B: /* execute the B in /A{m,n}B/ */
6164 if (ST.c1 == CHRTEST_UNINIT) {
6165 /* calculate c1 and c2 for possible match of 1st char
6166 * following curly */
6167 ST.c1 = ST.c2 = CHRTEST_VOID;
6168 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
6169 regnode *text_node = ST.B;
6170 if (! HAS_TEXT(text_node))
6171 FIND_NEXT_IMPT(text_node);
6174 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
6176 But the former is redundant in light of the latter.
6178 if this changes back then the macro for
6179 IS_TEXT and friends need to change.
6181 if (PL_regkind[OP(text_node)] == EXACT) {
6182 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6183 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6193 PerlIO_printf(Perl_debug_log,
6194 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
6195 (int)(REPORT_CODE_OFF+(depth*2)),
6198 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
6199 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
6200 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6201 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6203 /* simulate B failing */
6205 PerlIO_printf(Perl_debug_log,
6206 "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
6207 (int)(REPORT_CODE_OFF+(depth*2)),"",
6208 valid_utf8_to_uvchr((U8 *) locinput, NULL),
6209 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
6210 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
6212 state_num = CURLYM_B_fail;
6213 goto reenter_switch;
6216 else if (nextchr != ST.c1 && nextchr != ST.c2) {
6217 /* simulate B failing */
6219 PerlIO_printf(Perl_debug_log,
6220 "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
6221 (int)(REPORT_CODE_OFF+(depth*2)),"",
6222 (int) nextchr, ST.c1, ST.c2)
6224 state_num = CURLYM_B_fail;
6225 goto reenter_switch;
6230 /* emulate CLOSE: mark current A as captured */
6231 I32 paren = ST.me->flags;
6233 rex->offs[paren].start
6234 = HOPc(locinput, -ST.alen) - reginfo->strbeg;
6235 rex->offs[paren].end = locinput - reginfo->strbeg;
6236 if ((U32)paren > rex->lastparen)
6237 rex->lastparen = paren;
6238 rex->lastcloseparen = paren;
6241 rex->offs[paren].end = -1;
6242 if (cur_eval && cur_eval->u.eval.close_paren &&
6243 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
6252 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
6253 assert(0); /* NOTREACHED */
6255 case CURLYM_B_fail: /* just failed to match a B */
6256 REGCP_UNWIND(ST.cp);
6257 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6259 I32 max = ARG2(ST.me);
6260 if (max != REG_INFTY && ST.count == max)
6262 goto curlym_do_A; /* try to match a further A */
6264 /* backtrack one A */
6265 if (ST.count == ARG1(ST.me) /* min */)
6268 SET_locinput(HOPc(locinput, -ST.alen));
6269 goto curlym_do_B; /* try to match B */
6272 #define ST st->u.curly
6274 #define CURLY_SETPAREN(paren, success) \
6277 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
6278 rex->offs[paren].end = locinput - reginfo->strbeg; \
6279 if (paren > rex->lastparen) \
6280 rex->lastparen = paren; \
6281 rex->lastcloseparen = paren; \
6284 rex->offs[paren].end = -1; \
6285 rex->lastparen = ST.lastparen; \
6286 rex->lastcloseparen = ST.lastcloseparen; \
6290 case STAR: /* /A*B/ where A is width 1 char */
6294 scan = NEXTOPER(scan);
6297 case PLUS: /* /A+B/ where A is width 1 char */
6301 scan = NEXTOPER(scan);
6304 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
6305 ST.paren = scan->flags; /* Which paren to set */
6306 ST.lastparen = rex->lastparen;
6307 ST.lastcloseparen = rex->lastcloseparen;
6308 if (ST.paren > maxopenparen)
6309 maxopenparen = ST.paren;
6310 ST.min = ARG1(scan); /* min to match */
6311 ST.max = ARG2(scan); /* max to match */
6312 if (cur_eval && cur_eval->u.eval.close_paren &&
6313 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6317 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
6320 case CURLY: /* /A{m,n}B/ where A is width 1 char */
6322 ST.min = ARG1(scan); /* min to match */
6323 ST.max = ARG2(scan); /* max to match */
6324 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
6327 * Lookahead to avoid useless match attempts
6328 * when we know what character comes next.
6330 * Used to only do .*x and .*?x, but now it allows
6331 * for )'s, ('s and (?{ ... })'s to be in the way
6332 * of the quantifier and the EXACT-like node. -- japhy
6335 assert(ST.min <= ST.max);
6336 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
6337 ST.c1 = ST.c2 = CHRTEST_VOID;
6340 regnode *text_node = next;
6342 if (! HAS_TEXT(text_node))
6343 FIND_NEXT_IMPT(text_node);
6345 if (! HAS_TEXT(text_node))
6346 ST.c1 = ST.c2 = CHRTEST_VOID;
6348 if ( PL_regkind[OP(text_node)] != EXACT ) {
6349 ST.c1 = ST.c2 = CHRTEST_VOID;
6353 /* Currently we only get here when
6355 PL_rekind[OP(text_node)] == EXACT
6357 if this changes back then the macro for IS_TEXT and
6358 friends need to change. */
6359 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
6360 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
6372 char *li = locinput;
6375 regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
6381 if (ST.c1 == CHRTEST_VOID)
6382 goto curly_try_B_min;
6384 ST.oldloc = locinput;
6386 /* set ST.maxpos to the furthest point along the
6387 * string that could possibly match */
6388 if (ST.max == REG_INFTY) {
6389 ST.maxpos = reginfo->strend - 1;
6391 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
6394 else if (utf8_target) {
6395 int m = ST.max - ST.min;
6396 for (ST.maxpos = locinput;
6397 m >0 && ST.maxpos < reginfo->strend; m--)
6398 ST.maxpos += UTF8SKIP(ST.maxpos);
6401 ST.maxpos = locinput + ST.max - ST.min;
6402 if (ST.maxpos >= reginfo->strend)
6403 ST.maxpos = reginfo->strend - 1;
6405 goto curly_try_B_min_known;
6409 /* avoid taking address of locinput, so it can remain
6411 char *li = locinput;
6412 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
6413 if (ST.count < ST.min)
6416 if ((ST.count > ST.min)
6417 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
6419 /* A{m,n} must come at the end of the string, there's
6420 * no point in backing off ... */
6422 /* ...except that $ and \Z can match before *and* after
6423 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
6424 We may back off by one in this case. */
6425 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
6429 goto curly_try_B_max;
6431 assert(0); /* NOTREACHED */
6434 case CURLY_B_min_known_fail:
6435 /* failed to find B in a non-greedy match where c1,c2 valid */
6437 REGCP_UNWIND(ST.cp);
6439 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6441 /* Couldn't or didn't -- move forward. */
6442 ST.oldloc = locinput;
6444 locinput += UTF8SKIP(locinput);
6448 curly_try_B_min_known:
6449 /* find the next place where 'B' could work, then call B */
6453 n = (ST.oldloc == locinput) ? 0 : 1;
6454 if (ST.c1 == ST.c2) {
6455 /* set n to utf8_distance(oldloc, locinput) */
6456 while (locinput <= ST.maxpos
6457 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
6459 locinput += UTF8SKIP(locinput);
6464 /* set n to utf8_distance(oldloc, locinput) */
6465 while (locinput <= ST.maxpos
6466 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
6467 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
6469 locinput += UTF8SKIP(locinput);
6474 else { /* Not utf8_target */
6475 if (ST.c1 == ST.c2) {
6476 while (locinput <= ST.maxpos &&
6477 UCHARAT(locinput) != ST.c1)
6481 while (locinput <= ST.maxpos
6482 && UCHARAT(locinput) != ST.c1
6483 && UCHARAT(locinput) != ST.c2)
6486 n = locinput - ST.oldloc;
6488 if (locinput > ST.maxpos)
6491 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
6492 * at b; check that everything between oldloc and
6493 * locinput matches */
6494 char *li = ST.oldloc;
6496 if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
6498 assert(n == REG_INFTY || locinput == li);
6500 CURLY_SETPAREN(ST.paren, ST.count);
6501 if (cur_eval && cur_eval->u.eval.close_paren &&
6502 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6505 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
6507 assert(0); /* NOTREACHED */
6510 case CURLY_B_min_fail:
6511 /* failed to find B in a non-greedy match where c1,c2 invalid */
6513 REGCP_UNWIND(ST.cp);
6515 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6517 /* failed -- move forward one */
6519 char *li = locinput;
6520 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
6527 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
6528 ST.count > 0)) /* count overflow ? */
6531 CURLY_SETPAREN(ST.paren, ST.count);
6532 if (cur_eval && cur_eval->u.eval.close_paren &&
6533 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6536 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
6540 assert(0); /* NOTREACHED */
6544 /* a successful greedy match: now try to match B */
6545 if (cur_eval && cur_eval->u.eval.close_paren &&
6546 cur_eval->u.eval.close_paren == (U32)ST.paren) {
6550 bool could_match = locinput < reginfo->strend;
6552 /* If it could work, try it. */
6553 if (ST.c1 != CHRTEST_VOID && could_match) {
6554 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
6556 could_match = memEQ(locinput,
6561 UTF8SKIP(locinput));
6564 could_match = UCHARAT(locinput) == ST.c1
6565 || UCHARAT(locinput) == ST.c2;
6568 if (ST.c1 == CHRTEST_VOID || could_match) {
6569 CURLY_SETPAREN(ST.paren, ST.count);
6570 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
6571 assert(0); /* NOTREACHED */
6576 case CURLY_B_max_fail:
6577 /* failed to find B in a greedy match */
6579 REGCP_UNWIND(ST.cp);
6581 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
6584 if (--ST.count < ST.min)
6586 locinput = HOPc(locinput, -1);
6587 goto curly_try_B_max;
6591 case END: /* last op of main pattern */
6594 /* we've just finished A in /(??{A})B/; now continue with B */
6596 st->u.eval.prev_rex = rex_sv; /* inner */
6598 /* Save *all* the positions. */
6599 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
6600 rex_sv = cur_eval->u.eval.prev_rex;
6601 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6602 SET_reg_curpm(rex_sv);
6603 rex = ReANY(rex_sv);
6604 rexi = RXi_GET(rex);
6605 cur_curlyx = cur_eval->u.eval.prev_curlyx;
6607 REGCP_SET(st->u.eval.lastcp);
6609 /* Restore parens of the outer rex without popping the
6611 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
6614 st->u.eval.prev_eval = cur_eval;
6615 cur_eval = cur_eval->u.eval.prev_eval;
6617 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
6618 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
6619 if ( nochange_depth )
6622 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
6623 locinput); /* match B */
6626 if (locinput < reginfo->till) {
6627 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6628 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
6630 (long)(locinput - startpos),
6631 (long)(reginfo->till - startpos),
6634 sayNO_SILENT; /* Cannot match: too short. */
6636 sayYES; /* Success! */
6638 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
6640 PerlIO_printf(Perl_debug_log,
6641 "%*s %ssubpattern success...%s\n",
6642 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
6643 sayYES; /* Success! */
6646 #define ST st->u.ifmatch
6651 case SUSPEND: /* (?>A) */
6653 newstart = locinput;
6656 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
6658 goto ifmatch_trivial_fail_test;
6660 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
6662 ifmatch_trivial_fail_test:
6664 char * const s = HOPBACKc(locinput, scan->flags);
6669 sw = 1 - cBOOL(ST.wanted);
6673 next = scan + ARG(scan);
6681 newstart = locinput;
6685 ST.logical = logical;
6686 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
6688 /* execute body of (?...A) */
6689 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
6690 assert(0); /* NOTREACHED */
6693 case IFMATCH_A_fail: /* body of (?...A) failed */
6694 ST.wanted = !ST.wanted;
6697 case IFMATCH_A: /* body of (?...A) succeeded */
6699 sw = cBOOL(ST.wanted);
6701 else if (!ST.wanted)
6704 if (OP(ST.me) != SUSPEND) {
6705 /* restore old position except for (?>...) */
6706 locinput = st->locinput;
6708 scan = ST.me + ARG(ST.me);
6711 continue; /* execute B */
6715 case LONGJMP: /* alternative with many branches compiles to
6716 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
6717 next = scan + ARG(scan);
6722 case COMMIT: /* (*COMMIT) */
6723 reginfo->cutpoint = reginfo->strend;
6726 case PRUNE: /* (*PRUNE) */
6728 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6729 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
6730 assert(0); /* NOTREACHED */
6732 case COMMIT_next_fail:
6736 case OPFAIL: /* (*FAIL) */
6738 assert(0); /* NOTREACHED */
6740 #define ST st->u.mark
6741 case MARKPOINT: /* (*MARK:foo) */
6742 ST.prev_mark = mark_state;
6743 ST.mark_name = sv_commit = sv_yes_mark
6744 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6746 ST.mark_loc = locinput;
6747 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
6748 assert(0); /* NOTREACHED */
6750 case MARKPOINT_next:
6751 mark_state = ST.prev_mark;
6753 assert(0); /* NOTREACHED */
6755 case MARKPOINT_next_fail:
6756 if (popmark && sv_eq(ST.mark_name,popmark))
6758 if (ST.mark_loc > startpoint)
6759 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6760 popmark = NULL; /* we found our mark */
6761 sv_commit = ST.mark_name;
6764 PerlIO_printf(Perl_debug_log,
6765 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
6766 REPORT_CODE_OFF+depth*2, "",
6767 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
6770 mark_state = ST.prev_mark;
6771 sv_yes_mark = mark_state ?
6772 mark_state->u.mark.mark_name : NULL;
6774 assert(0); /* NOTREACHED */
6776 case SKIP: /* (*SKIP) */
6778 /* (*SKIP) : if we fail we cut here*/
6779 ST.mark_name = NULL;
6780 ST.mark_loc = locinput;
6781 PUSH_STATE_GOTO(SKIP_next,next, locinput);
6783 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
6784 otherwise do nothing. Meaning we need to scan
6786 regmatch_state *cur = mark_state;
6787 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
6790 if ( sv_eq( cur->u.mark.mark_name,
6793 ST.mark_name = find;
6794 PUSH_STATE_GOTO( SKIP_next, next, locinput);
6796 cur = cur->u.mark.prev_mark;
6799 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
6802 case SKIP_next_fail:
6804 /* (*CUT:NAME) - Set up to search for the name as we
6805 collapse the stack*/
6806 popmark = ST.mark_name;
6808 /* (*CUT) - No name, we cut here.*/
6809 if (ST.mark_loc > startpoint)
6810 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
6811 /* but we set sv_commit to latest mark_name if there
6812 is one so they can test to see how things lead to this
6815 sv_commit=mark_state->u.mark.mark_name;
6819 assert(0); /* NOTREACHED */
6822 case LNBREAK: /* \R */
6823 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
6830 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
6831 PTR2UV(scan), OP(scan));
6832 Perl_croak(aTHX_ "regexp memory corruption");
6834 /* this is a point to jump to in order to increment
6835 * locinput by one character */
6837 assert(!NEXTCHR_IS_EOS);
6839 locinput += PL_utf8skip[nextchr];
6840 /* locinput is allowed to go 1 char off the end, but not 2+ */
6841 if (locinput > reginfo->strend)
6850 /* switch break jumps here */
6851 scan = next; /* prepare to execute the next op and ... */
6852 continue; /* ... jump back to the top, reusing st */
6853 assert(0); /* NOTREACHED */
6856 /* push a state that backtracks on success */
6857 st->u.yes.prev_yes_state = yes_state;
6861 /* push a new regex state, then continue at scan */
6863 regmatch_state *newst;
6866 regmatch_state *cur = st;
6867 regmatch_state *curyes = yes_state;
6869 regmatch_slab *slab = PL_regmatch_slab;
6870 for (;curd > -1;cur--,curd--) {
6871 if (cur < SLAB_FIRST(slab)) {
6873 cur = SLAB_LAST(slab);
6875 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
6876 REPORT_CODE_OFF + 2 + depth * 2,"",
6877 curd, PL_reg_name[cur->resume_state],
6878 (curyes == cur) ? "yes" : ""
6881 curyes = cur->u.yes.prev_yes_state;
6884 DEBUG_STATE_pp("push")
6887 st->locinput = locinput;
6889 if (newst > SLAB_LAST(PL_regmatch_slab))
6890 newst = S_push_slab(aTHX);
6891 PL_regmatch_state = newst;
6893 locinput = pushinput;
6896 assert(0); /* NOTREACHED */
6901 * We get here only if there's trouble -- normally "case END" is
6902 * the terminating point.
6904 Perl_croak(aTHX_ "corrupted regexp pointers");
6910 /* we have successfully completed a subexpression, but we must now
6911 * pop to the state marked by yes_state and continue from there */
6912 assert(st != yes_state);
6914 while (st != yes_state) {
6916 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6917 PL_regmatch_slab = PL_regmatch_slab->prev;
6918 st = SLAB_LAST(PL_regmatch_slab);
6922 DEBUG_STATE_pp("pop (no final)");
6924 DEBUG_STATE_pp("pop (yes)");
6930 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
6931 || yes_state > SLAB_LAST(PL_regmatch_slab))
6933 /* not in this slab, pop slab */
6934 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
6935 PL_regmatch_slab = PL_regmatch_slab->prev;
6936 st = SLAB_LAST(PL_regmatch_slab);
6938 depth -= (st - yes_state);
6941 yes_state = st->u.yes.prev_yes_state;
6942 PL_regmatch_state = st;
6945 locinput= st->locinput;
6946 state_num = st->resume_state + no_final;
6947 goto reenter_switch;
6950 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
6951 PL_colors[4], PL_colors[5]));
6953 if (reginfo->info_aux_eval) {
6954 /* each successfully executed (?{...}) block does the equivalent of
6955 * local $^R = do {...}
6956 * When popping the save stack, all these locals would be undone;
6957 * bypass this by setting the outermost saved $^R to the latest
6959 /* I dont know if this is needed or works properly now.
6960 * see code related to PL_replgv elsewhere in this file.
6963 if (oreplsv != GvSV(PL_replgv))
6964 sv_setsv(oreplsv, GvSV(PL_replgv));
6971 PerlIO_printf(Perl_debug_log,
6972 "%*s %sfailed...%s\n",
6973 REPORT_CODE_OFF+depth*2, "",
6974 PL_colors[4], PL_colors[5])
6986 /* there's a previous state to backtrack to */
6988 if (st < SLAB_FIRST(PL_regmatch_slab)) {
6989 PL_regmatch_slab = PL_regmatch_slab->prev;
6990 st = SLAB_LAST(PL_regmatch_slab);
6992 PL_regmatch_state = st;
6993 locinput= st->locinput;
6995 DEBUG_STATE_pp("pop");
6997 if (yes_state == st)
6998 yes_state = st->u.yes.prev_yes_state;
7000 state_num = st->resume_state + 1; /* failure = success + 1 */
7001 goto reenter_switch;
7006 if (rex->intflags & PREGf_VERBARG_SEEN) {
7007 SV *sv_err = get_sv("REGERROR", 1);
7008 SV *sv_mrk = get_sv("REGMARK", 1);
7010 sv_commit = &PL_sv_no;
7012 sv_yes_mark = &PL_sv_yes;
7015 sv_commit = &PL_sv_yes;
7016 sv_yes_mark = &PL_sv_no;
7018 sv_setsv(sv_err, sv_commit);
7019 sv_setsv(sv_mrk, sv_yes_mark);
7023 if (last_pushed_cv) {
7026 PERL_UNUSED_VAR(SP);
7029 assert(!result || locinput - reginfo->strbeg >= 0);
7030 return result ? locinput - reginfo->strbeg : -1;
7034 - regrepeat - repeatedly match something simple, report how many
7036 * What 'simple' means is a node which can be the operand of a quantifier like
7039 * startposp - pointer a pointer to the start position. This is updated
7040 * to point to the byte following the highest successful
7042 * p - the regnode to be repeatedly matched against.
7043 * reginfo - struct holding match state, such as strend
7044 * max - maximum number of things to match.
7045 * depth - (for debugging) backtracking depth.
7048 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
7049 regmatch_info *const reginfo, I32 max, int depth)
7052 char *scan; /* Pointer to current position in target string */
7054 char *loceol = reginfo->strend; /* local version */
7055 I32 hardcount = 0; /* How many matches so far */
7056 bool utf8_target = reginfo->is_utf8_target;
7057 int to_complement = 0; /* Invert the result? */
7059 _char_class_number classnum;
7061 PERL_UNUSED_ARG(depth);
7064 PERL_ARGS_ASSERT_REGREPEAT;
7067 if (max == REG_INFTY)
7069 else if (! utf8_target && loceol - scan > max)
7070 loceol = scan + max;
7072 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
7073 * to the maximum of how far we should go in it (leaving it set to the real
7074 * end, if the maximum permissible would take us beyond that). This allows
7075 * us to make the loop exit condition that we haven't gone past <loceol> to
7076 * also mean that we haven't exceeded the max permissible count, saving a
7077 * test each time through the loop. But it assumes that the OP matches a
7078 * single byte, which is true for most of the OPs below when applied to a
7079 * non-UTF-8 target. Those relatively few OPs that don't have this
7080 * characteristic will have to compensate.
7082 * There is no adjustment for UTF-8 targets, as the number of bytes per
7083 * character varies. OPs will have to test both that the count is less
7084 * than the max permissible (using <hardcount> to keep track), and that we
7085 * are still within the bounds of the string (using <loceol>. A few OPs
7086 * match a single byte no matter what the encoding. They can omit the max
7087 * test if, for the UTF-8 case, they do the adjustment that was skipped
7090 * Thus, the code above sets things up for the common case; and exceptional
7091 * cases need extra work; the common case is to make sure <scan> doesn't
7092 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
7093 * count doesn't exceed the maximum permissible */
7098 while (scan < loceol && hardcount < max && *scan != '\n') {
7099 scan += UTF8SKIP(scan);
7103 while (scan < loceol && *scan != '\n')
7109 while (scan < loceol && hardcount < max) {
7110 scan += UTF8SKIP(scan);
7117 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
7118 if (utf8_target && loceol - scan > max) {
7120 /* <loceol> hadn't been adjusted in the UTF-8 case */
7128 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7132 /* Can use a simple loop if the pattern char to match on is invariant
7133 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
7134 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
7135 * true iff it doesn't matter if the argument is in UTF-8 or not */
7136 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
7137 if (utf8_target && loceol - scan > max) {
7138 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
7139 * since here, to match at all, 1 char == 1 byte */
7140 loceol = scan + max;
7142 while (scan < loceol && UCHARAT(scan) == c) {
7146 else if (reginfo->is_utf8_pat) {
7148 STRLEN scan_char_len;
7150 /* When both target and pattern are UTF-8, we have to do
7152 while (hardcount < max
7154 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
7155 && memEQ(scan, STRING(p), scan_char_len))
7157 scan += scan_char_len;
7161 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
7163 /* Target isn't utf8; convert the character in the UTF-8
7164 * pattern to non-UTF8, and do a simple loop */
7165 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
7166 while (scan < loceol && UCHARAT(scan) == c) {
7169 } /* else pattern char is above Latin1, can't possibly match the
7174 /* Here, the string must be utf8; pattern isn't, and <c> is
7175 * different in utf8 than not, so can't compare them directly.
7176 * Outside the loop, find the two utf8 bytes that represent c, and
7177 * then look for those in sequence in the utf8 string */
7178 U8 high = UTF8_TWO_BYTE_HI(c);
7179 U8 low = UTF8_TWO_BYTE_LO(c);
7181 while (hardcount < max
7182 && scan + 1 < loceol
7183 && UCHARAT(scan) == high
7184 && UCHARAT(scan + 1) == low)
7192 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
7193 assert(! reginfo->is_utf8_pat);
7196 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
7200 utf8_flags = FOLDEQ_LOCALE;
7203 case EXACTF: /* This node only generated for non-utf8 patterns */
7204 assert(! reginfo->is_utf8_pat);
7210 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
7214 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
7216 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
7218 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
7221 if (c1 == CHRTEST_VOID) {
7222 /* Use full Unicode fold matching */
7223 char *tmpeol = reginfo->strend;
7224 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
7225 while (hardcount < max
7226 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
7227 STRING(p), NULL, pat_len,
7228 reginfo->is_utf8_pat, utf8_flags))
7231 tmpeol = reginfo->strend;
7235 else if (utf8_target) {
7237 while (scan < loceol
7239 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
7241 scan += UTF8SKIP(scan);
7246 while (scan < loceol
7248 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
7249 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
7251 scan += UTF8SKIP(scan);
7256 else if (c1 == c2) {
7257 while (scan < loceol && UCHARAT(scan) == c1) {
7262 while (scan < loceol &&
7263 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
7273 while (hardcount < max
7275 && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
7277 scan += UTF8SKIP(scan);
7281 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
7286 /* The argument (FLAGS) to all the POSIX node types is the class number */
7293 if (! utf8_target) {
7294 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
7300 while (hardcount < max && scan < loceol
7301 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
7304 scan += UTF8SKIP(scan);
7317 if (utf8_target && loceol - scan > max) {
7319 /* We didn't adjust <loceol> at the beginning of this routine
7320 * because is UTF-8, but it is actually ok to do so, since here, to
7321 * match, 1 char == 1 byte. */
7322 loceol = scan + max;
7324 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
7337 if (! utf8_target) {
7338 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
7344 /* The complement of something that matches only ASCII matches all
7345 * UTF-8 variant code points, plus everything in ASCII that isn't
7347 while (hardcount < max && scan < loceol
7348 && (! UTF8_IS_INVARIANT(*scan)
7349 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
7351 scan += UTF8SKIP(scan);
7362 if (! utf8_target) {
7363 while (scan < loceol && to_complement
7364 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
7371 classnum = (_char_class_number) FLAGS(p);
7372 if (classnum < _FIRST_NON_SWASH_CC) {
7374 /* Here, a swash is needed for above-Latin1 code points.
7375 * Process as many Latin1 code points using the built-in rules.
7376 * Go to another loop to finish processing upon encountering
7377 * the first Latin1 code point. We could do that in this loop
7378 * as well, but the other way saves having to test if the swash
7379 * has been loaded every time through the loop: extra space to
7381 while (hardcount < max && scan < loceol) {
7382 if (UTF8_IS_INVARIANT(*scan)) {
7383 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
7390 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
7391 if (! (to_complement
7392 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
7401 goto found_above_latin1;
7408 /* For these character classes, the knowledge of how to handle
7409 * every code point is compiled in to Perl via a macro. This
7410 * code is written for making the loops as tight as possible.
7411 * It could be refactored to save space instead */
7413 case _CC_ENUM_SPACE: /* XXX would require separate code
7414 if we revert the change of \v
7417 case _CC_ENUM_PSXSPC:
7418 while (hardcount < max
7420 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
7422 scan += UTF8SKIP(scan);
7426 case _CC_ENUM_BLANK:
7427 while (hardcount < max
7429 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
7431 scan += UTF8SKIP(scan);
7435 case _CC_ENUM_XDIGIT:
7436 while (hardcount < max
7438 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
7440 scan += UTF8SKIP(scan);
7444 case _CC_ENUM_VERTSPACE:
7445 while (hardcount < max
7447 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
7449 scan += UTF8SKIP(scan);
7453 case _CC_ENUM_CNTRL:
7454 while (hardcount < max
7456 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
7458 scan += UTF8SKIP(scan);
7463 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
7469 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
7471 /* Load the swash if not already present */
7472 if (! PL_utf8_swash_ptrs[classnum]) {
7473 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7474 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
7478 PL_XPosix_ptrs[classnum], &flags);
7481 while (hardcount < max && scan < loceol
7482 && to_complement ^ cBOOL(_generic_utf8(
7485 swash_fetch(PL_utf8_swash_ptrs[classnum],
7489 scan += UTF8SKIP(scan);
7496 while (hardcount < max && scan < loceol &&
7497 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
7502 /* LNBREAK can match one or two latin chars, which is ok, but we
7503 * have to use hardcount in this situation, and throw away the
7504 * adjustment to <loceol> done before the switch statement */
7505 loceol = reginfo->strend;
7506 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
7527 /* These are all 0 width, so match right here or not at all. */
7531 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
7532 assert(0); /* NOTREACHED */
7539 c = scan - *startposp;
7543 GET_RE_DEBUG_FLAGS_DECL;
7545 SV * const prop = sv_newmortal();
7546 regprop(prog, prop, p, reginfo);
7547 PerlIO_printf(Perl_debug_log,
7548 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
7549 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
7557 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
7559 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
7560 create a copy so that changes the caller makes won't change the shared one.
7561 If <altsvp> is non-null, will return NULL in it, for back-compat.
7564 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
7566 PERL_ARGS_ASSERT_REGCLASS_SWASH;
7572 return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL));
7576 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
7577 const regnode* node,
7580 SV** only_utf8_locale_ptr)
7582 /* For internal core use only.
7583 * Returns the swash for the input 'node' in the regex 'prog'.
7584 * If <doinit> is 'true', will attempt to create the swash if not already
7586 * If <listsvp> is non-null, will return the printable contents of the
7587 * swash. This can be used to get debugging information even before the
7588 * swash exists, by calling this function with 'doinit' set to false, in
7589 * which case the components that will be used to eventually create the
7590 * swash are returned (in a printable form).
7591 * Tied intimately to how regcomp.c sets up the data structure */
7595 SV *si = NULL; /* Input swash initialization string */
7598 RXi_GET_DECL(prog,progi);
7599 const struct reg_data * const data = prog ? progi->data : NULL;
7601 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
7603 assert(ANYOF_FLAGS(node)
7604 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
7606 if (data && data->count) {
7607 const U32 n = ARG(node);
7609 if (data->what[n] == 's') {
7610 SV * const rv = MUTABLE_SV(data->data[n]);
7611 AV * const av = MUTABLE_AV(SvRV(rv));
7612 SV **const ary = AvARRAY(av);
7613 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
7615 si = *ary; /* ary[0] = the string to initialize the swash with */
7617 /* Elements 3 and 4 are either both present or both absent. [3] is
7618 * any inversion list generated at compile time; [4] indicates if
7619 * that inversion list has any user-defined properties in it. */
7620 if (av_tindex(av) >= 2) {
7621 if (only_utf8_locale_ptr
7623 && ary[2] != &PL_sv_undef)
7625 *only_utf8_locale_ptr = ary[2];
7628 *only_utf8_locale_ptr = NULL;
7631 if (av_tindex(av) >= 3) {
7634 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
7642 /* Element [1] is reserved for the set-up swash. If already there,
7643 * return it; if not, create it and store it there */
7644 if (ary[1] && SvROK(ary[1])) {
7647 else if (doinit && ((si && si != &PL_sv_undef)
7648 || (invlist && invlist != &PL_sv_undef))) {
7650 sw = _core_swash_init("utf8", /* the utf8 package */
7654 0, /* not from tr/// */
7657 (void)av_store(av, 1, sw);
7662 /* If requested, return a printable version of what this swash matches */
7664 SV* matches_string = newSVpvn("", 0);
7666 /* The swash should be used, if possible, to get the data, as it
7667 * contains the resolved data. But this function can be called at
7668 * compile-time, before everything gets resolved, in which case we
7669 * return the currently best available information, which is the string
7670 * that will eventually be used to do that resolving, 'si' */
7671 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
7672 && (si && si != &PL_sv_undef))
7674 sv_catsv(matches_string, si);
7677 /* Add the inversion list to whatever we have. This may have come from
7678 * the swash, or from an input parameter */
7680 sv_catsv(matches_string, _invlist_contents(invlist));
7682 *listsvp = matches_string;
7687 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
7690 - reginclass - determine if a character falls into a character class
7692 n is the ANYOF regnode
7693 p is the target string
7694 p_end points to one byte beyond the end of the target string
7695 utf8_target tells whether p is in UTF-8.
7697 Returns true if matched; false otherwise.
7699 Note that this can be a synthetic start class, a combination of various
7700 nodes, so things you think might be mutually exclusive, such as locale,
7701 aren't. It can match both locale and non-locale
7706 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
7709 const char flags = ANYOF_FLAGS(n);
7713 PERL_ARGS_ASSERT_REGINCLASS;
7715 /* If c is not already the code point, get it. Note that
7716 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
7717 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
7719 c = utf8n_to_uvchr(p, p_end - p, &c_len,
7720 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
7721 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
7722 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
7723 * UTF8_ALLOW_FFFF */
7724 if (c_len == (STRLEN)-1)
7725 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
7728 /* If this character is potentially in the bitmap, check it */
7730 if (ANYOF_BITMAP_TEST(n, c))
7732 else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL
7738 else if (flags & ANYOF_LOCALE_FLAGS) {
7739 if (flags & ANYOF_LOC_FOLD) {
7740 if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
7744 if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) {
7746 /* The data structure is arranged so bits 0, 2, 4, ... are set
7747 * if the class includes the Posix character class given by
7748 * bit/2; and 1, 3, 5, ... are set if the class includes the
7749 * complemented Posix class given by int(bit/2). So we loop
7750 * through the bits, each time changing whether we complement
7751 * the result or not. Suppose for the sake of illustration
7752 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
7753 * is set, it means there is a match for this ANYOF node if the
7754 * character is in the class given by the expression (0 / 2 = 0
7755 * = \w). If it is in that class, isFOO_lc() will return 1,
7756 * and since 'to_complement' is 0, the result will stay TRUE,
7757 * and we exit the loop. Suppose instead that bit 0 is 0, but
7758 * bit 1 is 1. That means there is a match if the character
7759 * matches \W. We won't bother to call isFOO_lc() on bit 0,
7760 * but will on bit 1. On the second iteration 'to_complement'
7761 * will be 1, so the exclusive or will reverse things, so we
7762 * are testing for \W. On the third iteration, 'to_complement'
7763 * will be 0, and we would be testing for \s; the fourth
7764 * iteration would test for \S, etc.
7766 * Note that this code assumes that all the classes are closed
7767 * under folding. For example, if a character matches \w, then
7768 * its fold does too; and vice versa. This should be true for
7769 * any well-behaved locale for all the currently defined Posix
7770 * classes, except for :lower: and :upper:, which are handled
7771 * by the pseudo-class :cased: which matches if either of the
7772 * other two does. To get rid of this assumption, an outer
7773 * loop could be used below to iterate over both the source
7774 * character, and its fold (if different) */
7777 int to_complement = 0;
7779 while (count < ANYOF_MAX) {
7780 if (ANYOF_POSIXL_TEST(n, count)
7781 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
7794 /* If the bitmap didn't (or couldn't) match, and something outside the
7795 * bitmap could match, try that. */
7797 if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) {
7798 match = TRUE; /* Everything above 255 matches */
7800 else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
7801 || (utf8_target && (flags & ANYOF_UTF8))
7802 || ((flags & ANYOF_LOC_FOLD)
7803 && IN_UTF8_CTYPE_LOCALE
7804 && ARG(n) != ANYOF_NONBITMAP_EMPTY))
7806 SV* only_utf8_locale = NULL;
7807 SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
7813 } else { /* Convert to utf8 */
7815 utf8_p = bytes_to_utf8(p, &len);
7818 if (swash_fetch(sw, utf8_p, TRUE)) {
7822 /* If we allocated a string above, free it */
7823 if (! utf8_target) Safefree(utf8_p);
7825 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
7826 match = _invlist_contains_cp(only_utf8_locale, c);
7830 if (UNICODE_IS_SUPER(c)
7831 && (flags & ANYOF_WARN_SUPER)
7832 && ckWARN_d(WARN_NON_UNICODE))
7834 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
7835 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
7839 #if ANYOF_INVERT != 1
7840 /* Depending on compiler optimization cBOOL takes time, so if don't have to
7842 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
7845 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
7846 return (flags & ANYOF_INVERT) ^ match;
7850 S_reghop3(U8 *s, SSize_t off, const U8* lim)
7852 /* return the position 'off' UTF-8 characters away from 's', forward if
7853 * 'off' >= 0, backwards if negative. But don't go outside of position
7854 * 'lim', which better be < s if off < 0 */
7858 PERL_ARGS_ASSERT_REGHOP3;
7861 while (off-- && s < lim) {
7862 /* XXX could check well-formedness here */
7867 while (off++ && s > lim) {
7869 if (UTF8_IS_CONTINUED(*s)) {
7870 while (s > lim && UTF8_IS_CONTINUATION(*s))
7873 /* XXX could check well-formedness here */
7880 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
7884 PERL_ARGS_ASSERT_REGHOP4;
7887 while (off-- && s < rlim) {
7888 /* XXX could check well-formedness here */
7893 while (off++ && s > llim) {
7895 if (UTF8_IS_CONTINUED(*s)) {
7896 while (s > llim && UTF8_IS_CONTINUATION(*s))
7899 /* XXX could check well-formedness here */
7905 /* like reghop3, but returns NULL on overrun, rather than returning last
7909 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
7913 PERL_ARGS_ASSERT_REGHOPMAYBE3;
7916 while (off-- && s < lim) {
7917 /* XXX could check well-formedness here */
7924 while (off++ && s > lim) {
7926 if (UTF8_IS_CONTINUED(*s)) {
7927 while (s > lim && UTF8_IS_CONTINUATION(*s))
7930 /* XXX could check well-formedness here */
7939 /* when executing a regex that may have (?{}), extra stuff needs setting
7940 up that will be visible to the called code, even before the current
7941 match has finished. In particular:
7943 * $_ is localised to the SV currently being matched;
7944 * pos($_) is created if necessary, ready to be updated on each call-out
7946 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
7947 isn't set until the current pattern is successfully finished), so that
7948 $1 etc of the match-so-far can be seen;
7949 * save the old values of subbeg etc of the current regex, and set then
7950 to the current string (again, this is normally only done at the end
7955 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
7958 regexp *const rex = ReANY(reginfo->prog);
7959 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
7961 eval_state->rex = rex;
7964 /* Make $_ available to executed code. */
7965 if (reginfo->sv != DEFSV) {
7967 DEFSV_set(reginfo->sv);
7970 if (!(mg = mg_find_mglob(reginfo->sv))) {
7971 /* prepare for quick setting of pos */
7972 mg = sv_magicext_mglob(reginfo->sv);
7975 eval_state->pos_magic = mg;
7976 eval_state->pos = mg->mg_len;
7977 eval_state->pos_flags = mg->mg_flags;
7980 eval_state->pos_magic = NULL;
7982 if (!PL_reg_curpm) {
7983 /* PL_reg_curpm is a fake PMOP that we can attach the current
7984 * regex to and point PL_curpm at, so that $1 et al are visible
7985 * within a /(?{})/. It's just allocated once per interpreter the
7986 * first time its needed */
7987 Newxz(PL_reg_curpm, 1, PMOP);
7990 SV* const repointer = &PL_sv_undef;
7991 /* this regexp is also owned by the new PL_reg_curpm, which
7992 will try to free it. */
7993 av_push(PL_regex_padav, repointer);
7994 PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
7995 PL_regex_pad = AvARRAY(PL_regex_padav);
7999 SET_reg_curpm(reginfo->prog);
8000 eval_state->curpm = PL_curpm;
8001 PL_curpm = PL_reg_curpm;
8002 if (RXp_MATCH_COPIED(rex)) {
8003 /* Here is a serious problem: we cannot rewrite subbeg,
8004 since it may be needed if this match fails. Thus
8005 $` inside (?{}) could fail... */
8006 eval_state->subbeg = rex->subbeg;
8007 eval_state->sublen = rex->sublen;
8008 eval_state->suboffset = rex->suboffset;
8009 eval_state->subcoffset = rex->subcoffset;
8011 eval_state->saved_copy = rex->saved_copy;
8013 RXp_MATCH_COPIED_off(rex);
8016 eval_state->subbeg = NULL;
8017 rex->subbeg = (char *)reginfo->strbeg;
8019 rex->subcoffset = 0;
8020 rex->sublen = reginfo->strend - reginfo->strbeg;
8024 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
8027 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
8030 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
8031 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
8034 Safefree(aux->poscache);
8038 /* undo the effects of S_setup_eval_state() */
8040 if (eval_state->subbeg) {
8041 regexp * const rex = eval_state->rex;
8042 rex->subbeg = eval_state->subbeg;
8043 rex->sublen = eval_state->sublen;
8044 rex->suboffset = eval_state->suboffset;
8045 rex->subcoffset = eval_state->subcoffset;
8047 rex->saved_copy = eval_state->saved_copy;
8049 RXp_MATCH_COPIED_on(rex);
8051 if (eval_state->pos_magic)
8053 eval_state->pos_magic->mg_len = eval_state->pos;
8054 eval_state->pos_magic->mg_flags =
8055 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
8056 | (eval_state->pos_flags & MGf_BYTES);
8059 PL_curpm = eval_state->curpm;
8062 PL_regmatch_state = aux->old_regmatch_state;
8063 PL_regmatch_slab = aux->old_regmatch_slab;
8065 /* free all slabs above current one - this must be the last action
8066 * of this function, as aux and eval_state are allocated within
8067 * slabs and may be freed here */
8069 s = PL_regmatch_slab->next;
8071 PL_regmatch_slab->next = NULL;
8073 regmatch_slab * const osl = s;
8082 S_to_utf8_substr(pTHX_ regexp *prog)
8084 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8085 * on the converted value */
8089 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8092 if (prog->substrs->data[i].substr
8093 && !prog->substrs->data[i].utf8_substr) {
8094 SV* const sv = newSVsv(prog->substrs->data[i].substr);
8095 prog->substrs->data[i].utf8_substr = sv;
8096 sv_utf8_upgrade(sv);
8097 if (SvVALID(prog->substrs->data[i].substr)) {
8098 if (SvTAIL(prog->substrs->data[i].substr)) {
8099 /* Trim the trailing \n that fbm_compile added last
8101 SvCUR_set(sv, SvCUR(sv) - 1);
8102 /* Whilst this makes the SV technically "invalid" (as its
8103 buffer is no longer followed by "\0") when fbm_compile()
8104 adds the "\n" back, a "\0" is restored. */
8105 fbm_compile(sv, FBMcf_TAIL);
8109 if (prog->substrs->data[i].substr == prog->check_substr)
8110 prog->check_utf8 = sv;
8116 S_to_byte_substr(pTHX_ regexp *prog)
8118 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
8119 * on the converted value; returns FALSE if can't be converted. */
8124 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
8127 if (prog->substrs->data[i].utf8_substr
8128 && !prog->substrs->data[i].substr) {
8129 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
8130 if (! sv_utf8_downgrade(sv, TRUE)) {
8133 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
8134 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
8135 /* Trim the trailing \n that fbm_compile added last
8137 SvCUR_set(sv, SvCUR(sv) - 1);
8138 fbm_compile(sv, FBMcf_TAIL);
8142 prog->substrs->data[i].substr = sv;
8143 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
8144 prog->check_substr = sv;
8153 * c-indentation-style: bsd
8155 * indent-tabs-mode: nil
8158 * ex: set ts=8 sts=4 sw=4 et: