5 * One Ring to rule them all, One Ring to find them
7 * [p.v of _The Lord of the Rings_, opening poem]
8 * [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
9 * [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
12 /* This file contains functions for executing a regular expression. See
13 * also regcomp.c which funnily enough, contains functions for compiling
14 * a regular expression.
16 * This file is also copied at build time to ext/re/re_exec.c, where
17 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
18 * This causes the main functions to be compiled under new names and with
19 * debugging support added, which makes "use re 'debug'" work.
22 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
23 * confused with the original package (see point 3 below). Thanks, Henry!
26 /* Additional note: this code is very heavily munged from Henry's version
27 * in places. In some spots I've traded clarity for efficiency, so don't
28 * blame Henry for some of the lack of readability.
31 /* The names of the functions have been changed from regcomp and
32 * regexec to pregcomp and pregexec in order to avoid conflicts
33 * with the POSIX routines of the same names.
36 #ifdef PERL_EXT_RE_BUILD
40 #define B_ON_NON_UTF8_LOCALE_IS_WRONG \
41 "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
44 * pregcomp and pregexec -- regsub and regerror are not used in perl
46 * Copyright (c) 1986 by University of Toronto.
47 * Written by Henry Spencer. Not derived from licensed software.
49 * Permission is granted to anyone to use this software for any
50 * purpose on any computer system, and to redistribute it freely,
51 * subject to the following restrictions:
53 * 1. The author is not responsible for the consequences of use of
54 * this software, no matter how awful, even if they arise
57 * 2. The origin of this software must not be misrepresented, either
58 * by explicit claim or by omission.
60 * 3. Altered versions must be plainly marked as such, and must not
61 * be misrepresented as being the original software.
63 **** Alterations to Henry's code are...
65 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
66 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
67 **** by Larry Wall and others
69 **** You may distribute under the terms of either the GNU General Public
70 **** License or the Artistic License, as specified in the README file.
72 * Beware that some of this code is subtly aware of the way operator
73 * precedence is structured in regular expressions. Serious changes in
74 * regular-expression syntax might require a total rethink.
77 #define PERL_IN_REGEXEC_C
78 #undef PERL_IN_XSUB_RE
79 #define PERL_IN_XSUB_RE 1
82 #undef PERL_IN_XSUB_RE
84 #ifdef PERL_IN_XSUB_RE
90 #include "inline_invlist.c"
91 #include "unicode_constants.h"
94 /* At least one required character in the target string is expressible only in
96 static const char* const non_utf8_target_but_utf8_required
97 = "Can't match, because target string needs to be in UTF-8\n";
100 #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \
101 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\
105 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
108 #define STATIC static
111 /* Valid only for non-utf8 strings: avoids the reginclass
112 * call if there are no complications: i.e., if everything matchable is
113 * straight forward in the bitmap */
114 #define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,c+1,0) \
115 : ANYOF_BITMAP_TEST(p,*(c)))
121 #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv))
122 #define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b)
124 #define HOPc(pos,off) \
125 (char *)(reginfo->is_utf8_target \
126 ? reghop3((U8*)pos, off, \
127 (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \
130 #define HOPBACKc(pos, off) \
131 (char*)(reginfo->is_utf8_target \
132 ? reghopmaybe3((U8*)pos, -off, (U8*)(reginfo->strbeg)) \
133 : (pos - off >= reginfo->strbeg) \
137 #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off))
138 #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim))
140 /* lim must be +ve. Returns NULL on overshoot */
141 #define HOPMAYBE3(pos,off,lim) \
142 (reginfo->is_utf8_target \
143 ? reghopmaybe3((U8*)pos, off, (U8*)(lim)) \
144 : ((U8*)pos + off <= lim) \
148 /* like HOP3, but limits the result to <= lim even for the non-utf8 case.
149 * off must be >=0; args should be vars rather than expressions */
150 #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \
151 ? reghop3((U8*)(pos), off, (U8*)(lim)) \
152 : (U8*)((pos + off) > lim ? lim : (pos + off)))
154 #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \
155 ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \
157 #define HOP4c(pos,off,llim, rlim) ((char*)HOP4(pos,off,llim, rlim))
159 #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */
160 #define NEXTCHR_IS_EOS (nextchr < 0)
162 #define SET_nextchr \
163 nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS)
165 #define SET_locinput(p) \
170 #define LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist) STMT_START { \
172 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; \
173 swash_ptr = _core_swash_init("utf8", property_name, &PL_sv_undef, \
174 1, 0, invlist, &flags); \
179 /* If in debug mode, we test that a known character properly matches */
181 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
184 utf8_char_in_property) \
185 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist); \
186 assert(swash_fetch(swash_ptr, (U8 *) utf8_char_in_property, TRUE));
188 # define LOAD_UTF8_CHARCLASS_DEBUG_TEST(swash_ptr, \
191 utf8_char_in_property) \
192 LOAD_UTF8_CHARCLASS(swash_ptr, property_name, invlist)
195 #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \
196 PL_utf8_swash_ptrs[_CC_WORDCHAR], \
198 PL_XPosix_ptrs[_CC_WORDCHAR], \
199 LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
201 #define PLACEHOLDER /* Something for the preprocessor to grab onto */
202 /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
204 /* for use after a quantifier and before an EXACT-like node -- japhy */
205 /* it would be nice to rework regcomp.sym to generate this stuff. sigh
207 * NOTE that *nothing* that affects backtracking should be in here, specifically
208 * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a
209 * node that is in between two EXACT like nodes when ascertaining what the required
210 * "follow" character is. This should probably be moved to regex compile time
211 * although it may be done at run time beause of the REF possibility - more
212 * investigation required. -- demerphq
214 #define JUMPABLE(rn) ( \
216 (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \
218 OP(rn) == SUSPEND || OP(rn) == IFMATCH || \
219 OP(rn) == PLUS || OP(rn) == MINMOD || \
221 (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \
223 #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT)
225 #define HAS_TEXT(rn) ( IS_EXACT(rn) || PL_regkind[OP(rn)] == REF )
228 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
229 we don't need this definition. XXX These are now out-of-sync*/
230 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
231 #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
232 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
235 /* ... so we use this as its faster. */
236 #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
237 #define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
238 #define IS_TEXTF(rn) ( OP(rn)==EXACTF )
239 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
244 Search for mandatory following text node; for lookahead, the text must
245 follow but for lookbehind (rn->flags != 0) we skip to the next step.
247 #define FIND_NEXT_IMPT(rn) STMT_START { \
248 while (JUMPABLE(rn)) { \
249 const OPCODE type = OP(rn); \
250 if (type == SUSPEND || PL_regkind[type] == CURLY) \
251 rn = NEXTOPER(NEXTOPER(rn)); \
252 else if (type == PLUS) \
254 else if (type == IFMATCH) \
255 rn = (rn->flags == 0) ? NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \
256 else rn += NEXT_OFF(rn); \
260 #define SLAB_FIRST(s) (&(s)->states[0])
261 #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1])
263 static void S_setup_eval_state(pTHX_ regmatch_info *const reginfo);
264 static void S_cleanup_regmatch_info_aux(pTHX_ void *arg);
265 static regmatch_state * S_push_slab(pTHX);
267 #define REGCP_PAREN_ELEMS 3
268 #define REGCP_OTHER_ELEMS 3
269 #define REGCP_FRAME_ELEMS 1
270 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and
271 * are needed for the regexp context stack bookkeeping. */
274 S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen)
276 const int retval = PL_savestack_ix;
277 const int paren_elems_to_push =
278 (maxopenparen - parenfloor) * REGCP_PAREN_ELEMS;
279 const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS;
280 const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT;
282 GET_RE_DEBUG_FLAGS_DECL;
284 PERL_ARGS_ASSERT_REGCPPUSH;
286 if (paren_elems_to_push < 0)
287 Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0, maxopenparen: %i parenfloor: %i REGCP_PAREN_ELEMS: %u",
288 (int)paren_elems_to_push, (int)maxopenparen,
289 (int)parenfloor, (unsigned)REGCP_PAREN_ELEMS);
291 if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems)
292 Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf
293 " out of range (%lu-%ld)",
295 (unsigned long)maxopenparen,
298 SSGROW(total_elems + REGCP_FRAME_ELEMS);
301 if ((int)maxopenparen > (int)parenfloor)
302 PerlIO_printf(Perl_debug_log,
303 "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n",
308 for (p = parenfloor+1; p <= (I32)maxopenparen; p++) {
309 /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */
310 SSPUSHIV(rex->offs[p].end);
311 SSPUSHIV(rex->offs[p].start);
312 SSPUSHINT(rex->offs[p].start_tmp);
313 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
314 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n",
316 (IV)rex->offs[p].start,
317 (IV)rex->offs[p].start_tmp,
321 /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */
322 SSPUSHINT(maxopenparen);
323 SSPUSHINT(rex->lastparen);
324 SSPUSHINT(rex->lastcloseparen);
325 SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */
330 /* These are needed since we do not localize EVAL nodes: */
331 #define REGCP_SET(cp) \
333 PerlIO_printf(Perl_debug_log, \
334 " Setting an EVAL scope, savestack=%"IVdf"\n", \
335 (IV)PL_savestack_ix)); \
338 #define REGCP_UNWIND(cp) \
340 if (cp != PL_savestack_ix) \
341 PerlIO_printf(Perl_debug_log, \
342 " Clearing an EVAL scope, savestack=%"IVdf"..%"IVdf"\n", \
343 (IV)(cp), (IV)PL_savestack_ix)); \
346 #define UNWIND_PAREN(lp, lcp) \
347 for (n = rex->lastparen; n > lp; n--) \
348 rex->offs[n].end = -1; \
349 rex->lastparen = n; \
350 rex->lastcloseparen = lcp;
354 S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p)
358 GET_RE_DEBUG_FLAGS_DECL;
360 PERL_ARGS_ASSERT_REGCPPOP;
362 /* Pop REGCP_OTHER_ELEMS before the parentheses loop starts. */
364 assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */
365 i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */
366 rex->lastcloseparen = SSPOPINT;
367 rex->lastparen = SSPOPINT;
368 *maxopenparen_p = SSPOPINT;
370 i -= REGCP_OTHER_ELEMS;
371 /* Now restore the parentheses context. */
373 if (i || rex->lastparen + 1 <= rex->nparens)
374 PerlIO_printf(Perl_debug_log,
375 "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n",
380 paren = *maxopenparen_p;
381 for ( ; i > 0; i -= REGCP_PAREN_ELEMS) {
383 rex->offs[paren].start_tmp = SSPOPINT;
384 rex->offs[paren].start = SSPOPIV;
386 if (paren <= rex->lastparen)
387 rex->offs[paren].end = tmps;
388 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
389 " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n",
391 (IV)rex->offs[paren].start,
392 (IV)rex->offs[paren].start_tmp,
393 (IV)rex->offs[paren].end,
394 (paren > rex->lastparen ? "(skipped)" : ""));
399 /* It would seem that the similar code in regtry()
400 * already takes care of this, and in fact it is in
401 * a better location to since this code can #if 0-ed out
402 * but the code in regtry() is needed or otherwise tests
403 * requiring null fields (pat.t#187 and split.t#{13,14}
404 * (as of patchlevel 7877) will fail. Then again,
405 * this code seems to be necessary or otherwise
406 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
407 * --jhi updated by dapm */
408 for (i = rex->lastparen + 1; i <= rex->nparens; i++) {
409 if (i > *maxopenparen_p)
410 rex->offs[i].start = -1;
411 rex->offs[i].end = -1;
412 DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log,
413 " \\%"UVuf": %s ..-1 undeffing\n",
415 (i > *maxopenparen_p) ? "-1" : " "
421 /* restore the parens and associated vars at savestack position ix,
422 * but without popping the stack */
425 S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p)
427 I32 tmpix = PL_savestack_ix;
428 PL_savestack_ix = ix;
429 regcppop(rex, maxopenparen_p);
430 PL_savestack_ix = tmpix;
433 #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */
436 S_isFOO_lc(pTHX_ const U8 classnum, const U8 character)
438 /* Returns a boolean as to whether or not 'character' is a member of the
439 * Posix character class given by 'classnum' that should be equivalent to a
440 * value in the typedef '_char_class_number'.
442 * Ideally this could be replaced by a just an array of function pointers
443 * to the C library functions that implement the macros this calls.
444 * However, to compile, the precise function signatures are required, and
445 * these may vary from platform to to platform. To avoid having to figure
446 * out what those all are on each platform, I (khw) am using this method,
447 * which adds an extra layer of function call overhead (unless the C
448 * optimizer strips it away). But we don't particularly care about
449 * performance with locales anyway. */
451 switch ((_char_class_number) classnum) {
452 case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
453 case _CC_ENUM_ALPHA: return isALPHA_LC(character);
454 case _CC_ENUM_ASCII: return isASCII_LC(character);
455 case _CC_ENUM_BLANK: return isBLANK_LC(character);
456 case _CC_ENUM_CASED: return isLOWER_LC(character)
457 || isUPPER_LC(character);
458 case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
459 case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
460 case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
461 case _CC_ENUM_LOWER: return isLOWER_LC(character);
462 case _CC_ENUM_PRINT: return isPRINT_LC(character);
463 case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
464 case _CC_ENUM_SPACE: return isSPACE_LC(character);
465 case _CC_ENUM_UPPER: return isUPPER_LC(character);
466 case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
467 case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
468 default: /* VERTSPACE should never occur in locales */
469 Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
472 NOT_REACHED; /* NOTREACHED */
477 S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character)
479 /* Returns a boolean as to whether or not the (well-formed) UTF-8-encoded
480 * 'character' is a member of the Posix character class given by 'classnum'
481 * that should be equivalent to a value in the typedef
482 * '_char_class_number'.
484 * This just calls isFOO_lc on the code point for the character if it is in
485 * the range 0-255. Outside that range, all characters use Unicode
486 * rules, ignoring any locale. So use the Unicode function if this class
487 * requires a swash, and use the Unicode macro otherwise. */
489 PERL_ARGS_ASSERT_ISFOO_UTF8_LC;
491 if (UTF8_IS_INVARIANT(*character)) {
492 return isFOO_lc(classnum, *character);
494 else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
495 return isFOO_lc(classnum,
496 TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
499 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
501 if (classnum < _FIRST_NON_SWASH_CC) {
503 /* Initialize the swash unless done already */
504 if (! PL_utf8_swash_ptrs[classnum]) {
505 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
506 PL_utf8_swash_ptrs[classnum] =
507 _core_swash_init("utf8",
510 PL_XPosix_ptrs[classnum], &flags);
513 return cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum], (U8 *)
515 TRUE /* is UTF */ ));
518 switch ((_char_class_number) classnum) {
519 case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
520 case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
521 case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
522 case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
526 return FALSE; /* Things like CNTRL are always below 256 */
530 * pregexec and friends
533 #ifndef PERL_IN_XSUB_RE
535 - pregexec - match a regexp against a string
538 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend,
539 char *strbeg, SSize_t minend, SV *screamer, U32 nosave)
540 /* stringarg: the point in the string at which to begin matching */
541 /* strend: pointer to null at end of string */
542 /* strbeg: real beginning of string */
543 /* minend: end of match must be >= minend bytes after stringarg. */
544 /* screamer: SV being matched: only used for utf8 flag, pos() etc; string
545 * itself is accessed via the pointers above */
546 /* nosave: For optimizations. */
548 PERL_ARGS_ASSERT_PREGEXEC;
551 regexec_flags(prog, stringarg, strend, strbeg, minend, screamer, NULL,
552 nosave ? 0 : REXEC_COPY_STR);
558 /* re_intuit_start():
560 * Based on some optimiser hints, try to find the earliest position in the
561 * string where the regex could match.
563 * rx: the regex to match against
564 * sv: the SV being matched: only used for utf8 flag; the string
565 * itself is accessed via the pointers below. Note that on
566 * something like an overloaded SV, SvPOK(sv) may be false
567 * and the string pointers may point to something unrelated to
569 * strbeg: real beginning of string
570 * strpos: the point in the string at which to begin matching
571 * strend: pointer to the byte following the last char of the string
572 * flags currently unused; set to 0
573 * data: currently unused; set to NULL
575 * The basic idea of re_intuit_start() is to use some known information
576 * about the pattern, namely:
578 * a) the longest known anchored substring (i.e. one that's at a
579 * constant offset from the beginning of the pattern; but not
580 * necessarily at a fixed offset from the beginning of the
582 * b) the longest floating substring (i.e. one that's not at a constant
583 * offset from the beginning of the pattern);
584 * c) Whether the pattern is anchored to the string; either
585 * an absolute anchor: /^../, or anchored to \n: /^.../m,
586 * or anchored to pos(): /\G/;
587 * d) A start class: a real or synthetic character class which
588 * represents which characters are legal at the start of the pattern;
590 * to either quickly reject the match, or to find the earliest position
591 * within the string at which the pattern might match, thus avoiding
592 * running the full NFA engine at those earlier locations, only to
593 * eventually fail and retry further along.
595 * Returns NULL if the pattern can't match, or returns the address within
596 * the string which is the earliest place the match could occur.
598 * The longest of the anchored and floating substrings is called 'check'
599 * and is checked first. The other is called 'other' and is checked
600 * second. The 'other' substring may not be present. For example,
602 * /(abc|xyz)ABC\d{0,3}DEFG/
606 * check substr (float) = "DEFG", offset 6..9 chars
607 * other substr (anchored) = "ABC", offset 3..3 chars
610 * Be aware that during the course of this function, sometimes 'anchored'
611 * refers to a substring being anchored relative to the start of the
612 * pattern, and sometimes to the pattern itself being anchored relative to
613 * the string. For example:
615 * /\dabc/: "abc" is anchored to the pattern;
616 * /^\dabc/: "abc" is anchored to the pattern and the string;
617 * /\d+abc/: "abc" is anchored to neither the pattern nor the string;
618 * /^\d+abc/: "abc" is anchored to neither the pattern nor the string,
619 * but the pattern is anchored to the string.
623 Perl_re_intuit_start(pTHX_
626 const char * const strbeg,
630 re_scream_pos_data *data)
632 struct regexp *const prog = ReANY(rx);
633 SSize_t start_shift = prog->check_offset_min;
634 /* Should be nonnegative! */
635 SSize_t end_shift = 0;
636 /* current lowest pos in string where the regex can start matching */
637 char *rx_origin = strpos;
639 const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */
640 U8 other_ix = 1 - prog->substrs->check_ix;
642 char *other_last = strpos;/* latest pos 'other' substr already checked to */
643 char *check_at = NULL; /* check substr found at this pos */
644 const I32 multiline = prog->extflags & RXf_PMf_MULTILINE;
645 RXi_GET_DECL(prog,progi);
646 regmatch_info reginfo_buf; /* create some info to pass to find_byclass */
647 regmatch_info *const reginfo = ®info_buf;
648 GET_RE_DEBUG_FLAGS_DECL;
650 PERL_ARGS_ASSERT_RE_INTUIT_START;
651 PERL_UNUSED_ARG(flags);
652 PERL_UNUSED_ARG(data);
654 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
655 "Intuit: trying to determine minimum start position...\n"));
657 /* for now, assume that all substr offsets are positive. If at some point
658 * in the future someone wants to do clever things with look-behind and
659 * -ve offsets, they'll need to fix up any code in this function
660 * which uses these offsets. See the thread beginning
661 * <20140113145929.GF27210@iabyn.com>
663 assert(prog->substrs->data[0].min_offset >= 0);
664 assert(prog->substrs->data[0].max_offset >= 0);
665 assert(prog->substrs->data[1].min_offset >= 0);
666 assert(prog->substrs->data[1].max_offset >= 0);
667 assert(prog->substrs->data[2].min_offset >= 0);
668 assert(prog->substrs->data[2].max_offset >= 0);
670 /* for now, assume that if both present, that the floating substring
671 * doesn't start before the anchored substring.
672 * If you break this assumption (e.g. doing better optimisations
673 * with lookahead/behind), then you'll need to audit the code in this
674 * function carefully first
677 ! ( (prog->anchored_utf8 || prog->anchored_substr)
678 && (prog->float_utf8 || prog->float_substr))
679 || (prog->float_min_offset >= prog->anchored_offset));
681 /* byte rather than char calculation for efficiency. It fails
682 * to quickly reject some cases that can't match, but will reject
683 * them later after doing full char arithmetic */
684 if (prog->minlen > strend - strpos) {
685 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
686 " String too short...\n"));
690 RX_MATCH_UTF8_set(rx,utf8_target);
691 reginfo->is_utf8_target = cBOOL(utf8_target);
692 reginfo->info_aux = NULL;
693 reginfo->strbeg = strbeg;
694 reginfo->strend = strend;
695 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
697 /* not actually used within intuit, but zero for safety anyway */
698 reginfo->poscache_maxiter = 0;
701 if (!prog->check_utf8 && prog->check_substr)
702 to_utf8_substr(prog);
703 check = prog->check_utf8;
705 if (!prog->check_substr && prog->check_utf8) {
706 if (! to_byte_substr(prog)) {
707 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(fail);
710 check = prog->check_substr;
713 /* dump the various substring data */
714 DEBUG_OPTIMISE_MORE_r({
716 for (i=0; i<=2; i++) {
717 SV *sv = (utf8_target ? prog->substrs->data[i].utf8_substr
718 : prog->substrs->data[i].substr);
722 PerlIO_printf(Perl_debug_log,
723 " substrs[%d]: min=%"IVdf" max=%"IVdf" end shift=%"IVdf
724 " useful=%"IVdf" utf8=%d [%s]\n",
726 (IV)prog->substrs->data[i].min_offset,
727 (IV)prog->substrs->data[i].max_offset,
728 (IV)prog->substrs->data[i].end_shift,
735 if (prog->intflags & PREGf_ANCH) { /* Match at \G, beg-of-str or after \n */
737 /* ml_anch: check after \n?
739 * A note about PREGf_IMPLICIT: on an un-anchored pattern beginning
740 * with /.*.../, these flags will have been added by the
742 * /.*abc/, /.*abc/m: PREGf_IMPLICIT | PREGf_ANCH_MBOL
743 * /.*abc/s: PREGf_IMPLICIT | PREGf_ANCH_SBOL
745 ml_anch = (prog->intflags & PREGf_ANCH_MBOL)
746 && !(prog->intflags & PREGf_IMPLICIT);
748 if (!ml_anch && !(prog->intflags & PREGf_IMPLICIT)) {
749 /* we are only allowed to match at BOS or \G */
751 /* trivially reject if there's a BOS anchor and we're not at BOS.
753 * Note that we don't try to do a similar quick reject for
754 * \G, since generally the caller will have calculated strpos
755 * based on pos() and gofs, so the string is already correctly
756 * anchored by definition; and handling the exceptions would
757 * be too fiddly (e.g. REXEC_IGNOREPOS).
759 if ( strpos != strbeg
760 && (prog->intflags & PREGf_ANCH_SBOL))
762 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
763 " Not at start...\n"));
767 /* in the presence of an anchor, the anchored (relative to the
768 * start of the regex) substr must also be anchored relative
769 * to strpos. So quickly reject if substr isn't found there.
770 * This works for \G too, because the caller will already have
771 * subtracted gofs from pos, and gofs is the offset from the
772 * \G to the start of the regex. For example, in /.abc\Gdef/,
773 * where substr="abcdef", pos()=3, gofs=4, offset_min=1:
774 * caller will have set strpos=pos()-4; we look for the substr
775 * at position pos()-4+1, which lines up with the "a" */
777 if (prog->check_offset_min == prog->check_offset_max
778 && !(prog->intflags & PREGf_CANY_SEEN))
780 /* Substring at constant offset from beg-of-str... */
781 SSize_t slen = SvCUR(check);
782 char *s = HOP3c(strpos, prog->check_offset_min, strend);
784 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
785 " Looking for check substr at fixed offset %"IVdf"...\n",
786 (IV)prog->check_offset_min));
789 /* In this case, the regex is anchored at the end too.
790 * Unless it's a multiline match, the lengths must match
791 * exactly, give or take a \n. NB: slen >= 1 since
792 * the last char of check is \n */
794 && ( strend - s > slen
795 || strend - s < slen - 1
796 || (strend - s == slen && strend[-1] != '\n')))
798 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
799 " String too long...\n"));
802 /* Now should match s[0..slen-2] */
805 if (slen && (*SvPVX_const(check) != *s
806 || (slen > 1 && memNE(SvPVX_const(check), s, slen))))
808 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
809 " String not equal...\n"));
814 goto success_at_start;
819 end_shift = prog->check_end_shift;
821 #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */
823 Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ",
824 (IV)end_shift, RX_PRECOMP(prog));
829 /* This is the (re)entry point of the main loop in this function.
830 * The goal of this loop is to:
831 * 1) find the "check" substring in the region rx_origin..strend
832 * (adjusted by start_shift / end_shift). If not found, reject
834 * 2) If it exists, look for the "other" substr too if defined; for
835 * example, if the check substr maps to the anchored substr, then
836 * check the floating substr, and vice-versa. If not found, go
837 * back to (1) with rx_origin suitably incremented.
838 * 3) If we find an rx_origin position that doesn't contradict
839 * either of the substrings, then check the possible additional
840 * constraints on rx_origin of /^.../m or a known start class.
841 * If these fail, then depending on which constraints fail, jump
842 * back to here, or to various other re-entry points further along
843 * that skip some of the first steps.
844 * 4) If we pass all those tests, update the BmUSEFUL() count on the
845 * substring. If the start position was determined to be at the
846 * beginning of the string - so, not rejected, but not optimised,
847 * since we have to run regmatch from position 0 - decrement the
848 * BmUSEFUL() count. Otherwise increment it.
852 /* first, look for the 'check' substring */
858 DEBUG_OPTIMISE_MORE_r({
859 PerlIO_printf(Perl_debug_log,
860 " At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
861 " Start shift: %"IVdf" End shift %"IVdf
862 " Real end Shift: %"IVdf"\n",
863 (IV)(rx_origin - strbeg),
864 (IV)prog->check_offset_min,
867 (IV)prog->check_end_shift);
870 if (prog->intflags & PREGf_CANY_SEEN) {
871 start_point= (U8*)(rx_origin + start_shift);
872 end_point= (U8*)(strend - end_shift);
873 if (start_point > end_point)
876 end_point = HOP3(strend, -end_shift, strbeg);
877 start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
883 /* If the regex is absolutely anchored to either the start of the
884 * string (SBOL) or to pos() (ANCH_GPOS), then
885 * check_offset_max represents an upper bound on the string where
886 * the substr could start. For the ANCH_GPOS case, we assume that
887 * the caller of intuit will have already set strpos to
888 * pos()-gofs, so in this case strpos + offset_max will still be
889 * an upper bound on the substr.
892 && prog->intflags & PREGf_ANCH
893 && prog->check_offset_max != SSize_t_MAX)
895 SSize_t len = SvCUR(check) - !!SvTAIL(check);
896 const char * const anchor =
897 (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg);
899 /* do a bytes rather than chars comparison. It's conservative;
900 * so it skips doing the HOP if the result can't possibly end
901 * up earlier than the old value of end_point.
903 if ((char*)end_point - anchor > prog->check_offset_max) {
904 end_point = HOP3lim((U8*)anchor,
905 prog->check_offset_max,
911 check_at = fbm_instr( start_point, end_point,
912 check, multiline ? FBMrf_MULTILINE : 0);
914 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
915 " doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
916 (IV)((char*)start_point - strbeg),
917 (IV)((char*)end_point - strbeg),
918 (IV)(check_at ? check_at - strbeg : -1)
921 /* Update the count-of-usability, remove useless subpatterns,
925 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
926 SvPVX_const(check), RE_SV_DUMPLEN(check), 30);
927 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s%s",
928 (check_at ? "Found" : "Did not find"),
929 (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr)
930 ? "anchored" : "floating"),
933 (check_at ? " at offset " : "...\n") );
938 /* set rx_origin to the minimum position where the regex could start
939 * matching, given the constraint of the just-matched check substring.
940 * But don't set it lower than previously.
943 if (check_at - rx_origin > prog->check_offset_max)
944 rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
945 /* Finish the diagnostic message */
946 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
947 "%ld (rx_origin now %"IVdf")...\n",
948 (long)(check_at - strbeg),
949 (IV)(rx_origin - strbeg)
954 /* now look for the 'other' substring if defined */
956 if (utf8_target ? prog->substrs->data[other_ix].utf8_substr
957 : prog->substrs->data[other_ix].substr)
959 /* Take into account the "other" substring. */
963 struct reg_substr_datum *other;
966 other = &prog->substrs->data[other_ix];
968 /* if "other" is anchored:
969 * we've previously found a floating substr starting at check_at.
970 * This means that the regex origin must lie somewhere
971 * between min (rx_origin): HOP3(check_at, -check_offset_max)
972 * and max: HOP3(check_at, -check_offset_min)
973 * (except that min will be >= strpos)
974 * So the fixed substr must lie somewhere between
975 * HOP3(min, anchored_offset)
976 * HOP3(max, anchored_offset) + SvCUR(substr)
979 /* if "other" is floating
980 * Calculate last1, the absolute latest point where the
981 * floating substr could start in the string, ignoring any
982 * constraints from the earlier fixed match. It is calculated
985 * strend - prog->minlen (in chars) is the absolute latest
986 * position within the string where the origin of the regex
987 * could appear. The latest start point for the floating
988 * substr is float_min_offset(*) on from the start of the
989 * regex. last1 simply combines thee two offsets.
991 * (*) You might think the latest start point should be
992 * float_max_offset from the regex origin, and technically
993 * you'd be correct. However, consider
995 * Here, float min, max are 3,5 and minlen is 7.
996 * This can match either
1000 * In the first case, the regex matches minlen chars; in the
1001 * second, minlen+1, in the third, minlen+2.
1002 * In the first case, the floating offset is 3 (which equals
1003 * float_min), in the second, 4, and in the third, 5 (which
1004 * equals float_max). In all cases, the floating string bcd
1005 * can never start more than 4 chars from the end of the
1006 * string, which equals minlen - float_min. As the substring
1007 * starts to match more than float_min from the start of the
1008 * regex, it makes the regex match more than minlen chars,
1009 * and the two cancel each other out. So we can always use
1010 * float_min - minlen, rather than float_max - minlen for the
1011 * latest position in the string.
1013 * Note that -minlen + float_min_offset is equivalent (AFAIKT)
1014 * to CHR_SVLEN(must) - !!SvTAIL(must) + prog->float_end_shift
1017 assert(prog->minlen >= other->min_offset);
1018 last1 = HOP3c(strend,
1019 other->min_offset - prog->minlen, strbeg);
1021 if (other_ix) {/* i.e. if (other-is-float) */
1022 /* last is the latest point where the floating substr could
1023 * start, *given* any constraints from the earlier fixed
1024 * match. This constraint is that the floating string starts
1025 * <= float_max_offset chars from the regex origin (rx_origin).
1026 * If this value is less than last1, use it instead.
1028 assert(rx_origin <= last1);
1030 /* this condition handles the offset==infinity case, and
1031 * is a short-cut otherwise. Although it's comparing a
1032 * byte offset to a char length, it does so in a safe way,
1033 * since 1 char always occupies 1 or more bytes,
1034 * so if a string range is (last1 - rx_origin) bytes,
1035 * it will be less than or equal to (last1 - rx_origin)
1036 * chars; meaning it errs towards doing the accurate HOP3
1037 * rather than just using last1 as a short-cut */
1038 (last1 - rx_origin) < other->max_offset
1040 : (char*)HOP3lim(rx_origin, other->max_offset, last1);
1043 assert(strpos + start_shift <= check_at);
1044 last = HOP4c(check_at, other->min_offset - start_shift,
1048 s = HOP3c(rx_origin, other->min_offset, strend);
1049 if (s < other_last) /* These positions already checked */
1052 must = utf8_target ? other->utf8_substr : other->substr;
1053 assert(SvPOK(must));
1056 char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
1060 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1061 " skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
1062 (IV)(from - strbeg),
1068 (unsigned char*)from,
1071 multiline ? FBMrf_MULTILINE : 0
1073 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1074 " doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
1075 (IV)(from - strbeg),
1077 (IV)(s ? s - strbeg : -1)
1083 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
1084 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
1085 PerlIO_printf(Perl_debug_log, " %s %s substr %s%s",
1086 s ? "Found" : "Contradicts",
1087 other_ix ? "floating" : "anchored",
1088 quoted, RE_SV_TAIL(must));
1093 /* last1 is latest possible substr location. If we didn't
1094 * find it before there, we never will */
1095 if (last >= last1) {
1096 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1097 "; giving up...\n"));
1101 /* try to find the check substr again at a later
1102 * position. Maybe next time we'll find the "other" substr
1104 other_last = HOP3c(last, 1, strend) /* highest failure */;
1106 other_ix /* i.e. if other-is-float */
1107 ? HOP3c(rx_origin, 1, strend)
1108 : HOP4c(last, 1 - other->min_offset, strbeg, strend);
1109 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1110 "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
1111 (other_ix ? "floating" : "anchored"),
1112 (long)(HOP3c(check_at, 1, strend) - strbeg),
1113 (IV)(rx_origin - strbeg)
1118 if (other_ix) { /* if (other-is-float) */
1119 /* other_last is set to s, not s+1, since its possible for
1120 * a floating substr to fail first time, then succeed
1121 * second time at the same floating position; e.g.:
1122 * "-AB--AABZ" =~ /\wAB\d*Z/
1123 * The first time round, anchored and float match at
1124 * "-(AB)--AAB(Z)" then fail on the initial \w character
1125 * class. Second time round, they match at "-AB--A(AB)(Z)".
1130 rx_origin = HOP3c(s, -other->min_offset, strbeg);
1131 other_last = HOP3c(s, 1, strend);
1133 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1134 " at offset %ld (rx_origin now %"IVdf")...\n",
1136 (IV)(rx_origin - strbeg)
1142 DEBUG_OPTIMISE_MORE_r(
1143 PerlIO_printf(Perl_debug_log,
1144 " Check-only match: offset min:%"IVdf" max:%"IVdf
1145 " check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
1146 " strend:%"IVdf"\n",
1147 (IV)prog->check_offset_min,
1148 (IV)prog->check_offset_max,
1149 (IV)(check_at-strbeg),
1150 (IV)(rx_origin-strbeg),
1151 (IV)(rx_origin-check_at),
1157 postprocess_substr_matches:
1159 /* handle the extra constraint of /^.../m if present */
1161 if (ml_anch && rx_origin != strbeg && rx_origin[-1] != '\n') {
1164 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1165 " looking for /^/m anchor"));
1167 /* we have failed the constraint of a \n before rx_origin.
1168 * Find the next \n, if any, even if it's beyond the current
1169 * anchored and/or floating substrings. Whether we should be
1170 * scanning ahead for the next \n or the next substr is debatable.
1171 * On the one hand you'd expect rare substrings to appear less
1172 * often than \n's. On the other hand, searching for \n means
1173 * we're effectively flipping between check_substr and "\n" on each
1174 * iteration as the current "rarest" string candidate, which
1175 * means for example that we'll quickly reject the whole string if
1176 * hasn't got a \n, rather than trying every substr position
1180 s = HOP3c(strend, - prog->minlen, strpos);
1181 if (s <= rx_origin ||
1182 ! ( rx_origin = (char *)memchr(rx_origin, '\n', s - rx_origin)))
1184 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1185 " Did not find /%s^%s/m...\n",
1186 PL_colors[0], PL_colors[1]));
1190 /* earliest possible origin is 1 char after the \n.
1191 * (since *rx_origin == '\n', it's safe to ++ here rather than
1192 * HOP(rx_origin, 1)) */
1195 if (prog->substrs->check_ix == 0 /* check is anchored */
1196 || rx_origin >= HOP3c(check_at, - prog->check_offset_min, strpos))
1198 /* Position contradicts check-string; either because
1199 * check was anchored (and thus has no wiggle room),
1200 * or check was float and rx_origin is above the float range */
1201 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1202 " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
1203 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1207 /* if we get here, the check substr must have been float,
1208 * is in range, and we may or may not have had an anchored
1209 * "other" substr which still contradicts */
1210 assert(prog->substrs->check_ix); /* check is float */
1212 if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) {
1213 /* whoops, the anchored "other" substr exists, so we still
1214 * contradict. On the other hand, the float "check" substr
1215 * didn't contradict, so just retry the anchored "other"
1217 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1218 " Found /%s^%s/m, rescanning for anchored from offset %ld (rx_origin now %"IVdf")...\n",
1219 PL_colors[0], PL_colors[1],
1220 (long)(rx_origin - strbeg + prog->anchored_offset),
1221 (long)(rx_origin - strbeg)
1223 goto do_other_substr;
1226 /* success: we don't contradict the found floating substring
1227 * (and there's no anchored substr). */
1228 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1229 " Found /%s^%s/m with rx_origin %ld...\n",
1230 PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
1233 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1234 " (multiline anchor test skipped)\n"));
1240 /* if we have a starting character class, then test that extra constraint.
1241 * (trie stclasses are too expensive to use here, we are better off to
1242 * leave it to regmatch itself) */
1244 if (progi->regstclass && PL_regkind[OP(progi->regstclass)]!=TRIE) {
1245 const U8* const str = (U8*)STRING(progi->regstclass);
1247 /* XXX this value could be pre-computed */
1248 const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT
1249 ? (reginfo->is_utf8_pat
1250 ? utf8_distance(str + STR_LEN(progi->regstclass), str)
1251 : STR_LEN(progi->regstclass))
1255 /* latest pos that a matching float substr constrains rx start to */
1256 char *rx_max_float = NULL;
1258 /* if the current rx_origin is anchored, either by satisfying an
1259 * anchored substring constraint, or a /^.../m constraint, then we
1260 * can reject the current origin if the start class isn't found
1261 * at the current position. If we have a float-only match, then
1262 * rx_origin is constrained to a range; so look for the start class
1263 * in that range. if neither, then look for the start class in the
1264 * whole rest of the string */
1266 /* XXX DAPM it's not clear what the minlen test is for, and why
1267 * it's not used in the floating case. Nothing in the test suite
1268 * causes minlen == 0 here. See <20140313134639.GS12844@iabyn.com>.
1269 * Here are some old comments, which may or may not be correct:
1271 * minlen == 0 is possible if regstclass is \b or \B,
1272 * and the fixed substr is ''$.
1273 * Since minlen is already taken into account, rx_origin+1 is
1274 * before strend; accidentally, minlen >= 1 guaranties no false
1275 * positives at rx_origin + 1 even for \b or \B. But (minlen? 1 :
1276 * 0) below assumes that regstclass does not come from lookahead...
1277 * If regstclass takes bytelength more than 1: If charlength==1, OK.
1278 * This leaves EXACTF-ish only, which are dealt with in
1282 if (prog->anchored_substr || prog->anchored_utf8 || ml_anch)
1283 endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend);
1284 else if (prog->float_substr || prog->float_utf8) {
1285 rx_max_float = HOP3c(check_at, -start_shift, strbeg);
1286 endpos= HOP3c(rx_max_float, cl_l, strend);
1291 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1292 " looking for class: start_shift: %"IVdf" check_at: %"IVdf
1293 " rx_origin: %"IVdf" endpos: %"IVdf"\n",
1294 (IV)start_shift, (IV)(check_at - strbeg),
1295 (IV)(rx_origin - strbeg), (IV)(endpos - strbeg)));
1297 s = find_byclass(prog, progi->regstclass, rx_origin, endpos,
1300 if (endpos == strend) {
1301 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1302 " Could not match STCLASS...\n") );
1305 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1306 " This position contradicts STCLASS...\n") );
1307 if ((prog->intflags & PREGf_ANCH) && !ml_anch
1308 && !(prog->intflags & PREGf_IMPLICIT))
1311 /* Contradict one of substrings */
1312 if (prog->anchored_substr || prog->anchored_utf8) {
1313 if (prog->substrs->check_ix == 1) { /* check is float */
1314 /* Have both, check_string is floating */
1315 assert(rx_origin + start_shift <= check_at);
1316 if (rx_origin + start_shift != check_at) {
1317 /* not at latest position float substr could match:
1318 * Recheck anchored substring, but not floating.
1319 * The condition above is in bytes rather than
1320 * chars for efficiency. It's conservative, in
1321 * that it errs on the side of doing 'goto
1322 * do_other_substr'. In this case, at worst,
1323 * an extra anchored search may get done, but in
1324 * practice the extra fbm_instr() is likely to
1325 * get skipped anyway. */
1326 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1327 " about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
1328 (long)(other_last - strbeg),
1329 (IV)(rx_origin - strbeg)
1331 goto do_other_substr;
1339 /* In the presence of ml_anch, we might be able to
1340 * find another \n without breaking the current float
1343 /* strictly speaking this should be HOP3c(..., 1, ...),
1344 * but since we goto a block of code that's going to
1345 * search for the next \n if any, its safe here */
1347 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1348 " about to look for /%s^%s/m starting at rx_origin %ld...\n",
1349 PL_colors[0], PL_colors[1],
1350 (long)(rx_origin - strbeg)) );
1351 goto postprocess_substr_matches;
1354 /* strictly speaking this can never be true; but might
1355 * be if we ever allow intuit without substrings */
1356 if (!(utf8_target ? prog->float_utf8 : prog->float_substr))
1359 rx_origin = rx_max_float;
1362 /* at this point, any matching substrings have been
1363 * contradicted. Start again... */
1365 rx_origin = HOP3c(rx_origin, 1, strend);
1367 /* uses bytes rather than char calculations for efficiency.
1368 * It's conservative: it errs on the side of doing 'goto restart',
1369 * where there is code that does a proper char-based test */
1370 if (rx_origin + start_shift + end_shift > strend) {
1371 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1372 " Could not match STCLASS...\n") );
1375 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
1376 " about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
1377 (prog->substrs->check_ix ? "floating" : "anchored"),
1378 (long)(rx_origin + start_shift - strbeg),
1379 (IV)(rx_origin - strbeg)
1386 if (rx_origin != s) {
1387 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1388 " By STCLASS: moving %ld --> %ld\n",
1389 (long)(rx_origin - strbeg), (long)(s - strbeg))
1393 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1394 " Does not contradict STCLASS...\n");
1399 /* Decide whether using the substrings helped */
1401 if (rx_origin != strpos) {
1402 /* Fixed substring is found far enough so that the match
1403 cannot start at strpos. */
1405 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " try at offset...\n"));
1406 ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */
1409 /* The found rx_origin position does not prohibit matching at
1410 * strpos, so calling intuit didn't gain us anything. Decrement
1411 * the BmUSEFUL() count on the check substring, and if we reach
1413 if (!(prog->intflags & PREGf_NAUGHTY)
1415 prog->check_utf8 /* Could be deleted already */
1416 && --BmUSEFUL(prog->check_utf8) < 0
1417 && (prog->check_utf8 == prog->float_utf8)
1419 prog->check_substr /* Could be deleted already */
1420 && --BmUSEFUL(prog->check_substr) < 0
1421 && (prog->check_substr == prog->float_substr)
1424 /* If flags & SOMETHING - do not do it many times on the same match */
1425 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " ... Disabling check substring...\n"));
1426 /* XXX Does the destruction order has to change with utf8_target? */
1427 SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr);
1428 SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8);
1429 prog->check_substr = prog->check_utf8 = NULL; /* disable */
1430 prog->float_substr = prog->float_utf8 = NULL; /* clear */
1431 check = NULL; /* abort */
1432 /* XXXX This is a remnant of the old implementation. It
1433 looks wasteful, since now INTUIT can use many
1434 other heuristics. */
1435 prog->extflags &= ~RXf_USE_INTUIT;
1439 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
1440 "Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
1441 PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
1445 fail_finish: /* Substring not found */
1446 if (prog->check_substr || prog->check_utf8) /* could be removed already */
1447 BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */
1449 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n",
1450 PL_colors[4], PL_colors[5]));
1455 #define DECL_TRIE_TYPE(scan) \
1456 const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
1457 trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
1458 trie_utf8l, trie_flu8 } \
1459 trie_type = ((scan->flags == EXACT) \
1460 ? (utf8_target ? trie_utf8 : trie_plain) \
1461 : (scan->flags == EXACTL) \
1462 ? (utf8_target ? trie_utf8l : trie_plain) \
1463 : (scan->flags == EXACTFA) \
1465 ? trie_utf8_exactfa_fold \
1466 : trie_latin_utf8_exactfa_fold) \
1467 : (scan->flags == EXACTFLU8 \
1471 : trie_latin_utf8_fold)))
1473 #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
1476 U8 flags = FOLD_FLAGS_FULL; \
1477 switch (trie_type) { \
1479 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1480 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1481 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1483 goto do_trie_utf8_fold; \
1484 case trie_utf8_exactfa_fold: \
1485 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1487 case trie_utf8_fold: \
1488 do_trie_utf8_fold: \
1489 if ( foldlen>0 ) { \
1490 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1495 uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
1496 len = UTF8SKIP(uc); \
1497 skiplen = UNISKIP( uvc ); \
1498 foldlen -= skiplen; \
1499 uscan = foldbuf + skiplen; \
1502 case trie_latin_utf8_exactfa_fold: \
1503 flags |= FOLD_FLAGS_NOMIX_ASCII; \
1505 case trie_latin_utf8_fold: \
1506 if ( foldlen>0 ) { \
1507 uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
1513 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
1514 skiplen = UNISKIP( uvc ); \
1515 foldlen -= skiplen; \
1516 uscan = foldbuf + skiplen; \
1520 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
1521 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
1522 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
1526 uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
1533 charid = trie->charmap[ uvc ]; \
1537 if (widecharmap) { \
1538 SV** const svpp = hv_fetch(widecharmap, \
1539 (char*)&uvc, sizeof(UV), 0); \
1541 charid = (U16)SvIV(*svpp); \
1546 #define DUMP_EXEC_POS(li,s,doutf8) \
1547 dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \
1550 #define REXEC_FBC_EXACTISH_SCAN(COND) \
1554 && (ln == 1 || folder(s, pat_string, ln)) \
1555 && (reginfo->intuit || regtry(reginfo, &s)) )\
1561 #define REXEC_FBC_UTF8_SCAN(CODE) \
1563 while (s < strend) { \
1569 #define REXEC_FBC_SCAN(CODE) \
1571 while (s < strend) { \
1577 #define REXEC_FBC_UTF8_CLASS_SCAN(COND) \
1578 REXEC_FBC_UTF8_SCAN( /* Loops while (s < strend) */ \
1580 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1589 #define REXEC_FBC_CLASS_SCAN(COND) \
1590 REXEC_FBC_SCAN( /* Loops while (s < strend) */ \
1592 if (tmp && (reginfo->intuit || regtry(reginfo, &s))) \
1601 #define REXEC_FBC_CSCAN(CONDUTF8,COND) \
1602 if (utf8_target) { \
1603 REXEC_FBC_UTF8_CLASS_SCAN(CONDUTF8); \
1606 REXEC_FBC_CLASS_SCAN(COND); \
1609 /* The three macros below are slightly different versions of the same logic.
1611 * The first is for /a and /aa when the target string is UTF-8. This can only
1612 * match ascii, but it must advance based on UTF-8. The other two handle the
1613 * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking
1614 * for the boundary (or non-boundary) between a word and non-word character.
1615 * The utf8 and non-utf8 cases have the same logic, but the details must be
1616 * different. Find the "wordness" of the character just prior to this one, and
1617 * compare it with the wordness of this one. If they differ, we have a
1618 * boundary. At the beginning of the string, pretend that the previous
1619 * character was a new-line.
1621 * All these macros uncleanly have side-effects with each other and outside
1622 * variables. So far it's been too much trouble to clean-up
1624 * TEST_NON_UTF8 is the macro or function to call to test if its byte input is
1625 * a word character or not.
1626 * IF_SUCCESS is code to do if it finds that we are at a boundary between
1628 * IF_FAIL is code to do if we aren't at a boundary between word/non-word
1630 * Exactly one of the two IF_FOO parameters is a no-op, depending on whether we
1631 * are looking for a boundary or for a non-boundary. If we are looking for a
1632 * boundary, we want IF_FAIL to be the no-op, and for IF_SUCCESS to go out and
1633 * see if this tentative match actually works, and if so, to quit the loop
1634 * here. And vice-versa if we are looking for a non-boundary.
1636 * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and
1637 * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of
1638 * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be
1639 * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal
1640 * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that
1641 * complement. But in that branch we complement tmp, meaning that at the
1642 * bottom of the loop tmp is always going to be equal to TEST_NON_UTF8(s),
1643 * which means at the top of the loop in the next iteration, it is
1644 * TEST_NON_UTF8(s-1) */
1645 #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1646 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1647 tmp = TEST_NON_UTF8(tmp); \
1648 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1649 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1651 IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \
1658 /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and
1659 * TEST_UTF8 is a macro that for the same input code points returns identically
1660 * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */
1661 #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \
1662 if (s == reginfo->strbeg) { \
1665 else { /* Back-up to the start of the previous character */ \
1666 U8 * const r = reghop3((U8*)s, -1, (U8*)reginfo->strbeg); \
1667 tmp = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, \
1668 0, UTF8_ALLOW_DEFAULT); \
1670 tmp = TEST_UV(tmp); \
1671 LOAD_UTF8_CHARCLASS_ALNUM(); \
1672 REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \
1673 if (tmp == ! (TEST_UTF8((U8 *) s))) { \
1682 /* Like the above two macros. UTF8_CODE is the complete code for handling
1683 * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc
1685 #define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \
1686 if (utf8_target) { \
1689 else { /* Not utf8 */ \
1690 tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \
1691 tmp = TEST_NON_UTF8(tmp); \
1692 REXEC_FBC_SCAN( /* advances s while s < strend */ \
1693 if (tmp == ! TEST_NON_UTF8((U8) *s)) { \
1702 /* Here, things have been set up by the previous code so that tmp is the \
1703 * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \
1704 * utf8ness of the target). We also have to check if this matches against \
1705 * the EOS, which we treat as a \n (which is the same value in both UTF-8 \
1706 * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \
1708 if (tmp == ! TEST_NON_UTF8('\n')) { \
1715 /* This is the macro to use when we want to see if something that looks like it
1716 * could match, actually does, and if so exits the loop */
1717 #define REXEC_FBC_TRYIT \
1718 if ((reginfo->intuit || regtry(reginfo, &s))) \
1721 /* The only difference between the BOUND and NBOUND cases is that
1722 * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in
1723 * NBOUND. This is accomplished by passing it as either the if or else clause,
1724 * with the other one being empty (PLACEHOLDER is defined as empty).
1726 * The TEST_FOO parameters are for operating on different forms of input, but
1727 * all should be ones that return identically for the same underlying code
1729 #define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1731 FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1732 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1734 #define FBC_BOUND_A(TEST_NON_UTF8) \
1736 FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \
1737 TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER)
1739 #define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \
1741 FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1742 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1744 #define FBC_NBOUND_A(TEST_NON_UTF8) \
1746 FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \
1747 TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT)
1749 /* Takes a pointer to an inversion list, a pointer to its corresponding
1750 * inversion map, and a code point, and returns the code point's value
1751 * according to the two arrays. It assumes that all code points have a value.
1752 * This is used as the base macro for macros for particular properties */
1753 #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \
1754 invmap[_invlist_search(invlist, cp)]
1756 /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead
1757 * of a code point, returning the value for the first code point in the string.
1758 * And it takes the particular macro name that finds the desired value given a
1759 * code point. Merely convert the UTF-8 to code point and call the cp macro */
1760 #define _generic_GET_BREAK_VAL_UTF8(cp_macro, pos, strend) \
1761 (__ASSERT_(pos < strend) \
1762 /* Note assumes is valid UTF-8 */ \
1763 (cp_macro(utf8_to_uvchr_buf((pos), (strend), NULL))))
1765 /* Returns the GCB value for the input code point */
1766 #define getGCB_VAL_CP(cp) \
1767 _generic_GET_BREAK_VAL_CP( \
1769 Grapheme_Cluster_Break_invmap, \
1772 /* Returns the GCB value for the first code point in the UTF-8 encoded string
1773 * bounded by pos and strend */
1774 #define getGCB_VAL_UTF8(pos, strend) \
1775 _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
1778 /* Returns the SB value for the input code point */
1779 #define getSB_VAL_CP(cp) \
1780 _generic_GET_BREAK_VAL_CP( \
1782 Sentence_Break_invmap, \
1785 /* Returns the SB value for the first code point in the UTF-8 encoded string
1786 * bounded by pos and strend */
1787 #define getSB_VAL_UTF8(pos, strend) \
1788 _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
1790 /* Returns the WB value for the input code point */
1791 #define getWB_VAL_CP(cp) \
1792 _generic_GET_BREAK_VAL_CP( \
1794 Word_Break_invmap, \
1797 /* Returns the WB value for the first code point in the UTF-8 encoded string
1798 * bounded by pos and strend */
1799 #define getWB_VAL_UTF8(pos, strend) \
1800 _generic_GET_BREAK_VAL_UTF8(getWB_VAL_CP, pos, strend)
1802 /* We know what class REx starts with. Try to find this position... */
1803 /* if reginfo->intuit, its a dryrun */
1804 /* annoyingly all the vars in this routine have different names from their counterparts
1805 in regmatch. /grrr */
1807 S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
1808 const char *strend, regmatch_info *reginfo)
1811 const I32 doevery = (prog->intflags & PREGf_SKIP) == 0;
1812 char *pat_string; /* The pattern's exactish string */
1813 char *pat_end; /* ptr to end char of pat_string */
1814 re_fold_t folder; /* Function for computing non-utf8 folds */
1815 const U8 *fold_array; /* array for folding ords < 256 */
1821 I32 tmp = 1; /* Scratch variable? */
1822 const bool utf8_target = reginfo->is_utf8_target;
1823 UV utf8_fold_flags = 0;
1824 const bool is_utf8_pat = reginfo->is_utf8_pat;
1825 bool to_complement = FALSE; /* Invert the result? Taking the xor of this
1826 with a result inverts that result, as 0^1 =
1828 _char_class_number classnum;
1830 RXi_GET_DECL(prog,progi);
1832 PERL_ARGS_ASSERT_FIND_BYCLASS;
1834 /* We know what class it must start with. */
1837 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1841 REXEC_FBC_UTF8_CLASS_SCAN(
1842 reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target));
1845 REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
1850 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
1857 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
1858 assert(! is_utf8_pat);
1861 if (is_utf8_pat || utf8_target) {
1862 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
1863 goto do_exactf_utf8;
1865 fold_array = PL_fold_latin1; /* Latin1 folds are not affected by */
1866 folder = foldEQ_latin1; /* /a, except the sharp s one which */
1867 goto do_exactf_non_utf8; /* isn't dealt with by these */
1869 case EXACTF: /* This node only generated for non-utf8 patterns */
1870 assert(! is_utf8_pat);
1872 utf8_fold_flags = 0;
1873 goto do_exactf_utf8;
1875 fold_array = PL_fold;
1877 goto do_exactf_non_utf8;
1880 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
1881 if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
1882 utf8_fold_flags = FOLDEQ_LOCALE;
1883 goto do_exactf_utf8;
1885 fold_array = PL_fold_locale;
1886 folder = foldEQ_locale;
1887 goto do_exactf_non_utf8;
1891 utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
1893 goto do_exactf_utf8;
1896 if (! utf8_target) { /* All code points in this node require
1897 UTF-8 to express. */
1900 utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
1901 | FOLDEQ_S2_FOLDS_SANE;
1902 goto do_exactf_utf8;
1905 if (is_utf8_pat || utf8_target) {
1906 utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
1907 goto do_exactf_utf8;
1910 /* Any 'ss' in the pattern should have been replaced by regcomp,
1911 * so we don't have to worry here about this single special case
1912 * in the Latin1 range */
1913 fold_array = PL_fold_latin1;
1914 folder = foldEQ_latin1;
1918 do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there
1919 are no glitches with fold-length differences
1920 between the target string and pattern */
1922 /* The idea in the non-utf8 EXACTF* cases is to first find the
1923 * first character of the EXACTF* node and then, if necessary,
1924 * case-insensitively compare the full text of the node. c1 is the
1925 * first character. c2 is its fold. This logic will not work for
1926 * Unicode semantics and the german sharp ss, which hence should
1927 * not be compiled into a node that gets here. */
1928 pat_string = STRING(c);
1929 ln = STR_LEN(c); /* length to match in octets/bytes */
1931 /* We know that we have to match at least 'ln' bytes (which is the
1932 * same as characters, since not utf8). If we have to match 3
1933 * characters, and there are only 2 availabe, we know without
1934 * trying that it will fail; so don't start a match past the
1935 * required minimum number from the far end */
1936 e = HOP3c(strend, -((SSize_t)ln), s);
1938 if (reginfo->intuit && e < s) {
1939 e = s; /* Due to minlen logic of intuit() */
1943 c2 = fold_array[c1];
1944 if (c1 == c2) { /* If char and fold are the same */
1945 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1);
1948 REXEC_FBC_EXACTISH_SCAN(*(U8*)s == c1 || *(U8*)s == c2);
1956 /* If one of the operands is in utf8, we can't use the simpler folding
1957 * above, due to the fact that many different characters can have the
1958 * same fold, or portion of a fold, or different- length fold */
1959 pat_string = STRING(c);
1960 ln = STR_LEN(c); /* length to match in octets/bytes */
1961 pat_end = pat_string + ln;
1962 lnc = is_utf8_pat /* length to match in characters */
1963 ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
1966 /* We have 'lnc' characters to match in the pattern, but because of
1967 * multi-character folding, each character in the target can match
1968 * up to 3 characters (Unicode guarantees it will never exceed
1969 * this) if it is utf8-encoded; and up to 2 if not (based on the
1970 * fact that the Latin 1 folds are already determined, and the
1971 * only multi-char fold in that range is the sharp-s folding to
1972 * 'ss'. Thus, a pattern character can match as little as 1/3 of a
1973 * string character. Adjust lnc accordingly, rounding up, so that
1974 * if we need to match at least 4+1/3 chars, that really is 5. */
1975 expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
1976 lnc = (lnc + expansion - 1) / expansion;
1978 /* As in the non-UTF8 case, if we have to match 3 characters, and
1979 * only 2 are left, it's guaranteed to fail, so don't start a
1980 * match that would require us to go beyond the end of the string
1982 e = HOP3c(strend, -((SSize_t)lnc), s);
1984 if (reginfo->intuit && e < s) {
1985 e = s; /* Due to minlen logic of intuit() */
1988 /* XXX Note that we could recalculate e to stop the loop earlier,
1989 * as the worst case expansion above will rarely be met, and as we
1990 * go along we would usually find that e moves further to the left.
1991 * This would happen only after we reached the point in the loop
1992 * where if there were no expansion we should fail. Unclear if
1993 * worth the expense */
1996 char *my_strend= (char *)strend;
1997 if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target,
1998 pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags)
1999 && (reginfo->intuit || regtry(reginfo, &s)) )
2003 s += (utf8_target) ? UTF8SKIP(s) : 1;
2009 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2010 if (FLAGS(c) != TRADITIONAL_BOUND) {
2011 if (! IN_UTF8_CTYPE_LOCALE) {
2012 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2013 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2018 FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2022 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2023 if (FLAGS(c) != TRADITIONAL_BOUND) {
2024 if (! IN_UTF8_CTYPE_LOCALE) {
2025 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2026 B_ON_NON_UTF8_LOCALE_IS_WRONG);
2031 FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
2034 case BOUND: /* regcomp.c makes sure that this only has the traditional \b
2036 assert(FLAGS(c) == TRADITIONAL_BOUND);
2038 FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2041 case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2043 assert(FLAGS(c) == TRADITIONAL_BOUND);
2045 FBC_BOUND_A(isWORDCHAR_A);
2048 case NBOUND: /* regcomp.c makes sure that this only has the traditional \b
2050 assert(FLAGS(c) == TRADITIONAL_BOUND);
2052 FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
2055 case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
2057 assert(FLAGS(c) == TRADITIONAL_BOUND);
2059 FBC_NBOUND_A(isWORDCHAR_A);
2063 if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
2064 FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2075 switch((bound_type) FLAGS(c)) {
2076 case TRADITIONAL_BOUND:
2077 FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
2080 if (s == reginfo->strbeg) { /* GCB always matches at begin and
2082 if (to_complement ^ cBOOL(reginfo->intuit
2083 || regtry(reginfo, &s)))
2087 s += (utf8_target) ? UTF8SKIP(s) : 1;
2091 GCB_enum before = getGCB_VAL_UTF8(
2093 (U8*)(reginfo->strbeg)),
2094 (U8*) reginfo->strend);
2095 while (s < strend) {
2096 GCB_enum after = getGCB_VAL_UTF8((U8*) s,
2097 (U8*) reginfo->strend);
2098 if (to_complement ^ isGCB(before, after)) {
2099 if (reginfo->intuit || regtry(reginfo, &s)) {
2107 else { /* Not utf8. Everything is a GCB except between CR and
2109 while (s < strend) {
2110 if (to_complement ^ (UCHARAT(s - 1) != '\r'
2111 || UCHARAT(s) != '\n'))
2113 if (reginfo->intuit || regtry(reginfo, &s)) {
2121 if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) {
2127 if (s == reginfo->strbeg) { /* SB always matches at beginning */
2129 ^ cBOOL(reginfo->intuit || regtry(reginfo, &s)))
2134 /* Didn't match. Go try at the next position */
2135 s += (utf8_target) ? UTF8SKIP(s) : 1;
2139 SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
2141 (U8*)(reginfo->strbeg)),
2142 (U8*) reginfo->strend);
2143 while (s < strend) {
2144 SB_enum after = getSB_VAL_UTF8((U8*) s,
2145 (U8*) reginfo->strend);
2146 if (to_complement ^ isSB(before,
2148 (U8*) reginfo->strbeg,
2150 (U8*) reginfo->strend,
2153 if (reginfo->intuit || regtry(reginfo, &s)) {
2161 else { /* Not utf8. */
2162 SB_enum before = getSB_VAL_CP((U8) *(s -1));
2163 while (s < strend) {
2164 SB_enum after = getSB_VAL_CP((U8) *s);
2165 if (to_complement ^ isSB(before,
2167 (U8*) reginfo->strbeg,
2169 (U8*) reginfo->strend,
2172 if (reginfo->intuit || regtry(reginfo, &s)) {
2181 /* Here are at the final position in the target string. The SB
2182 * value is always true here, so matches, depending on other
2184 if (to_complement ^ cBOOL(reginfo->intuit
2185 || regtry(reginfo, &s)))
2193 if (s == reginfo->strbeg) {
2194 if (to_complement ^ cBOOL(reginfo->intuit
2195 || regtry(reginfo, &s)))
2199 s += (utf8_target) ? UTF8SKIP(s) : 1;
2203 /* We are at a boundary between char_sub_0 and char_sub_1.
2204 * We also keep track of the value for char_sub_-1 as we
2205 * loop through the line. Context may be needed to make a
2206 * determination, and if so, this can save having to
2208 WB_enum previous = WB_UNKNOWN;
2209 WB_enum before = getWB_VAL_UTF8(
2212 (U8*)(reginfo->strbeg)),
2213 (U8*) reginfo->strend);
2214 while (s < strend) {
2215 WB_enum after = getWB_VAL_UTF8((U8*) s,
2216 (U8*) reginfo->strend);
2217 if (to_complement ^ isWB(previous,
2220 (U8*) reginfo->strbeg,
2222 (U8*) reginfo->strend,
2225 if (reginfo->intuit || regtry(reginfo, &s)) {
2234 else { /* Not utf8. */
2235 WB_enum previous = WB_UNKNOWN;
2236 WB_enum before = getWB_VAL_CP((U8) *(s -1));
2237 while (s < strend) {
2238 WB_enum after = getWB_VAL_CP((U8) *s);
2239 if (to_complement ^ isWB(previous,
2242 (U8*) reginfo->strbeg,
2244 (U8*) reginfo->strend,
2247 if (reginfo->intuit || regtry(reginfo, &s)) {
2257 if (to_complement ^ cBOOL(reginfo->intuit
2258 || regtry(reginfo, &s)))
2268 REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend),
2269 is_LNBREAK_latin1_safe(s, strend)
2273 /* The argument to all the POSIX node types is the class number to pass to
2274 * _generic_isCC() to build a mask for searching in PL_charclass[] */
2281 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2282 REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
2283 to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
2298 /* The complement of something that matches only ASCII matches all
2299 * non-ASCII, plus everything in ASCII that isn't in the class. */
2300 REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
2301 || ! _generic_isCC_A(*s, FLAGS(c)));
2310 /* Don't need to worry about utf8, as it can match only a single
2311 * byte invariant character. */
2312 REXEC_FBC_CLASS_SCAN(
2313 to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c))));
2321 if (! utf8_target) {
2322 REXEC_FBC_CLASS_SCAN(to_complement ^ cBOOL(_generic_isCC(*s,
2328 classnum = (_char_class_number) FLAGS(c);
2329 if (classnum < _FIRST_NON_SWASH_CC) {
2330 while (s < strend) {
2332 /* We avoid loading in the swash as long as possible, but
2333 * should we have to, we jump to a separate loop. This
2334 * extra 'if' statement is what keeps this code from being
2335 * just a call to REXEC_FBC_UTF8_CLASS_SCAN() */
2336 if (UTF8_IS_ABOVE_LATIN1(*s)) {
2337 goto found_above_latin1;
2339 if ((UTF8_IS_INVARIANT(*s)
2340 && to_complement ^ cBOOL(_generic_isCC((U8) *s,
2342 || (UTF8_IS_DOWNGRADEABLE_START(*s)
2343 && to_complement ^ cBOOL(
2344 _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
2348 if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
2360 else switch (classnum) { /* These classes are implemented as
2362 case _CC_ENUM_SPACE:
2363 REXEC_FBC_UTF8_CLASS_SCAN(
2364 to_complement ^ cBOOL(isSPACE_utf8(s)));
2367 case _CC_ENUM_BLANK:
2368 REXEC_FBC_UTF8_CLASS_SCAN(
2369 to_complement ^ cBOOL(isBLANK_utf8(s)));
2372 case _CC_ENUM_XDIGIT:
2373 REXEC_FBC_UTF8_CLASS_SCAN(
2374 to_complement ^ cBOOL(isXDIGIT_utf8(s)));
2377 case _CC_ENUM_VERTSPACE:
2378 REXEC_FBC_UTF8_CLASS_SCAN(
2379 to_complement ^ cBOOL(isVERTWS_utf8(s)));
2382 case _CC_ENUM_CNTRL:
2383 REXEC_FBC_UTF8_CLASS_SCAN(
2384 to_complement ^ cBOOL(isCNTRL_utf8(s)));
2388 Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
2389 NOT_REACHED; /* NOTREACHED */
2394 found_above_latin1: /* Here we have to load a swash to get the result
2395 for the current code point */
2396 if (! PL_utf8_swash_ptrs[classnum]) {
2397 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2398 PL_utf8_swash_ptrs[classnum] =
2399 _core_swash_init("utf8",
2402 PL_XPosix_ptrs[classnum], &flags);
2405 /* This is a copy of the loop above for swash classes, though using the
2406 * FBC macro instead of being expanded out. Since we've loaded the
2407 * swash, we don't have to check for that each time through the loop */
2408 REXEC_FBC_UTF8_CLASS_SCAN(
2409 to_complement ^ cBOOL(_generic_utf8(
2412 swash_fetch(PL_utf8_swash_ptrs[classnum],
2420 /* what trie are we using right now */
2421 reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ];
2422 reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ];
2423 HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]);
2425 const char *last_start = strend - trie->minlen;
2427 const char *real_start = s;
2429 STRLEN maxlen = trie->maxlen;
2431 U8 **points; /* map of where we were in the input string
2432 when reading a given char. For ASCII this
2433 is unnecessary overhead as the relationship
2434 is always 1:1, but for Unicode, especially
2435 case folded Unicode this is not true. */
2436 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2440 GET_RE_DEBUG_FLAGS_DECL;
2442 /* We can't just allocate points here. We need to wrap it in
2443 * an SV so it gets freed properly if there is a croak while
2444 * running the match */
2447 sv_points=newSV(maxlen * sizeof(U8 *));
2448 SvCUR_set(sv_points,
2449 maxlen * sizeof(U8 *));
2450 SvPOK_on(sv_points);
2451 sv_2mortal(sv_points);
2452 points=(U8**)SvPV_nolen(sv_points );
2453 if ( trie_type != trie_utf8_fold
2454 && (trie->bitmap || OP(c)==AHOCORASICKC) )
2457 bitmap=(U8*)trie->bitmap;
2459 bitmap=(U8*)ANYOF_BITMAP(c);
2461 /* this is the Aho-Corasick algorithm modified a touch
2462 to include special handling for long "unknown char" sequences.
2463 The basic idea being that we use AC as long as we are dealing
2464 with a possible matching char, when we encounter an unknown char
2465 (and we have not encountered an accepting state) we scan forward
2466 until we find a legal starting char.
2467 AC matching is basically that of trie matching, except that when
2468 we encounter a failing transition, we fall back to the current
2469 states "fail state", and try the current char again, a process
2470 we repeat until we reach the root state, state 1, or a legal
2471 transition. If we fail on the root state then we can either
2472 terminate if we have reached an accepting state previously, or
2473 restart the entire process from the beginning if we have not.
2476 while (s <= last_start) {
2477 const U32 uniflags = UTF8_ALLOW_DEFAULT;
2485 U8 *uscan = (U8*)NULL;
2486 U8 *leftmost = NULL;
2488 U32 accepted_word= 0;
2492 while ( state && uc <= (U8*)strend ) {
2494 U32 word = aho->states[ state ].wordnum;
2498 DEBUG_TRIE_EXECUTE_r(
2499 if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2500 dump_exec_pos( (char *)uc, c, strend, real_start,
2501 (char *)uc, utf8_target );
2502 PerlIO_printf( Perl_debug_log,
2503 " Scanning for legal start char...\n");
2507 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2511 while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) {
2517 if (uc >(U8*)last_start) break;
2521 U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ];
2522 if (!leftmost || lpos < leftmost) {
2523 DEBUG_r(accepted_word=word);
2529 points[pointpos++ % maxlen]= uc;
2530 if (foldlen || uc < (U8*)strend) {
2531 REXEC_TRIE_READ_CHAR(trie_type, trie,
2533 uscan, len, uvc, charid, foldlen,
2535 DEBUG_TRIE_EXECUTE_r({
2536 dump_exec_pos( (char *)uc, c, strend,
2537 real_start, s, utf8_target);
2538 PerlIO_printf(Perl_debug_log,
2539 " Charid:%3u CP:%4"UVxf" ",
2551 word = aho->states[ state ].wordnum;
2553 base = aho->states[ state ].trans.base;
2555 DEBUG_TRIE_EXECUTE_r({
2557 dump_exec_pos( (char *)uc, c, strend, real_start,
2559 PerlIO_printf( Perl_debug_log,
2560 "%sState: %4"UVxf", word=%"UVxf,
2561 failed ? " Fail transition to " : "",
2562 (UV)state, (UV)word);
2568 ( ((offset = base + charid
2569 - 1 - trie->uniquecharcount)) >= 0)
2570 && ((U32)offset < trie->lasttrans)
2571 && trie->trans[offset].check == state
2572 && (tmp=trie->trans[offset].next))
2574 DEBUG_TRIE_EXECUTE_r(
2575 PerlIO_printf( Perl_debug_log," - legal\n"));
2580 DEBUG_TRIE_EXECUTE_r(
2581 PerlIO_printf( Perl_debug_log," - fail\n"));
2583 state = aho->fail[state];
2587 /* we must be accepting here */
2588 DEBUG_TRIE_EXECUTE_r(
2589 PerlIO_printf( Perl_debug_log," - accepting\n"));
2598 if (!state) state = 1;
2601 if ( aho->states[ state ].wordnum ) {
2602 U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ];
2603 if (!leftmost || lpos < leftmost) {
2604 DEBUG_r(accepted_word=aho->states[ state ].wordnum);
2609 s = (char*)leftmost;
2610 DEBUG_TRIE_EXECUTE_r({
2612 Perl_debug_log,"Matches word #%"UVxf" at position %"IVdf". Trying full pattern...\n",
2613 (UV)accepted_word, (IV)(s - real_start)
2616 if (reginfo->intuit || regtry(reginfo, &s)) {
2622 DEBUG_TRIE_EXECUTE_r({
2623 PerlIO_printf( Perl_debug_log,"Pattern failed. Looking for new start point...\n");
2626 DEBUG_TRIE_EXECUTE_r(
2627 PerlIO_printf( Perl_debug_log,"No match.\n"));
2636 Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c));
2643 /* set RX_SAVED_COPY, RX_SUBBEG etc.
2644 * flags have same meanings as with regexec_flags() */
2647 S_reg_set_capture_string(pTHX_ REGEXP * const rx,
2654 struct regexp *const prog = ReANY(rx);
2656 if (flags & REXEC_COPY_STR) {
2660 PerlIO_printf(Perl_debug_log,
2661 "Copy on write: regexp capture, type %d\n",
2664 /* Create a new COW SV to share the match string and store
2665 * in saved_copy, unless the current COW SV in saved_copy
2666 * is valid and suitable for our purpose */
2667 if (( prog->saved_copy
2668 && SvIsCOW(prog->saved_copy)
2669 && SvPOKp(prog->saved_copy)
2672 && SvPVX(sv) == SvPVX(prog->saved_copy)))
2674 /* just reuse saved_copy SV */
2675 if (RXp_MATCH_COPIED(prog)) {
2676 Safefree(prog->subbeg);
2677 RXp_MATCH_COPIED_off(prog);
2681 /* create new COW SV to share string */
2682 RX_MATCH_COPY_FREE(rx);
2683 prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv);
2685 prog->subbeg = (char *)SvPVX_const(prog->saved_copy);
2686 assert (SvPOKp(prog->saved_copy));
2687 prog->sublen = strend - strbeg;
2688 prog->suboffset = 0;
2689 prog->subcoffset = 0;
2694 SSize_t max = strend - strbeg;
2697 if ( (flags & REXEC_COPY_SKIP_POST)
2698 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2699 && !(PL_sawampersand & SAWAMPERSAND_RIGHT)
2700 ) { /* don't copy $' part of string */
2703 /* calculate the right-most part of the string covered
2704 * by a capture. Due to look-ahead, this may be to
2705 * the right of $&, so we have to scan all captures */
2706 while (n <= prog->lastparen) {
2707 if (prog->offs[n].end > max)
2708 max = prog->offs[n].end;
2712 max = (PL_sawampersand & SAWAMPERSAND_LEFT)
2713 ? prog->offs[0].start
2715 assert(max >= 0 && max <= strend - strbeg);
2718 if ( (flags & REXEC_COPY_SKIP_PRE)
2719 && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */
2720 && !(PL_sawampersand & SAWAMPERSAND_LEFT)
2721 ) { /* don't copy $` part of string */
2724 /* calculate the left-most part of the string covered
2725 * by a capture. Due to look-behind, this may be to
2726 * the left of $&, so we have to scan all captures */
2727 while (min && n <= prog->lastparen) {
2728 if ( prog->offs[n].start != -1
2729 && prog->offs[n].start < min)
2731 min = prog->offs[n].start;
2735 if ((PL_sawampersand & SAWAMPERSAND_RIGHT)
2736 && min > prog->offs[0].end
2738 min = prog->offs[0].end;
2742 assert(min >= 0 && min <= max && min <= strend - strbeg);
2745 if (RX_MATCH_COPIED(rx)) {
2746 if (sublen > prog->sublen)
2748 (char*)saferealloc(prog->subbeg, sublen+1);
2751 prog->subbeg = (char*)safemalloc(sublen+1);
2752 Copy(strbeg + min, prog->subbeg, sublen, char);
2753 prog->subbeg[sublen] = '\0';
2754 prog->suboffset = min;
2755 prog->sublen = sublen;
2756 RX_MATCH_COPIED_on(rx);
2758 prog->subcoffset = prog->suboffset;
2759 if (prog->suboffset && utf8_target) {
2760 /* Convert byte offset to chars.
2761 * XXX ideally should only compute this if @-/@+
2762 * has been seen, a la PL_sawampersand ??? */
2764 /* If there's a direct correspondence between the
2765 * string which we're matching and the original SV,
2766 * then we can use the utf8 len cache associated with
2767 * the SV. In particular, it means that under //g,
2768 * sv_pos_b2u() will use the previously cached
2769 * position to speed up working out the new length of
2770 * subcoffset, rather than counting from the start of
2771 * the string each time. This stops
2772 * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g;
2773 * from going quadratic */
2774 if (SvPOKp(sv) && SvPVX(sv) == strbeg)
2775 prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset,
2776 SV_GMAGIC|SV_CONST_RETURN);
2778 prog->subcoffset = utf8_length((U8*)strbeg,
2779 (U8*)(strbeg+prog->suboffset));
2783 RX_MATCH_COPY_FREE(rx);
2784 prog->subbeg = strbeg;
2785 prog->suboffset = 0;
2786 prog->subcoffset = 0;
2787 prog->sublen = strend - strbeg;
2795 - regexec_flags - match a regexp against a string
2798 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend,
2799 char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags)
2800 /* stringarg: the point in the string at which to begin matching */
2801 /* strend: pointer to null at end of string */
2802 /* strbeg: real beginning of string */
2803 /* minend: end of match must be >= minend bytes after stringarg. */
2804 /* sv: SV being matched: only used for utf8 flag, pos() etc; string
2805 * itself is accessed via the pointers above */
2806 /* data: May be used for some additional optimizations.
2807 Currently unused. */
2808 /* flags: For optimizations. See REXEC_* in regexp.h */
2811 struct regexp *const prog = ReANY(rx);
2815 SSize_t minlen; /* must match at least this many chars */
2816 SSize_t dontbother = 0; /* how many characters not to try at end */
2817 const bool utf8_target = cBOOL(DO_UTF8(sv));
2819 RXi_GET_DECL(prog,progi);
2820 regmatch_info reginfo_buf; /* create some info to pass to regtry etc */
2821 regmatch_info *const reginfo = ®info_buf;
2822 regexp_paren_pair *swap = NULL;
2824 GET_RE_DEBUG_FLAGS_DECL;
2826 PERL_ARGS_ASSERT_REGEXEC_FLAGS;
2827 PERL_UNUSED_ARG(data);
2829 /* Be paranoid... */
2831 Perl_croak(aTHX_ "NULL regexp parameter");
2835 debug_start_match(rx, utf8_target, stringarg, strend,
2839 startpos = stringarg;
2841 if (prog->intflags & PREGf_GPOS_SEEN) {
2844 /* set reginfo->ganch, the position where \G can match */
2847 (flags & REXEC_IGNOREPOS)
2848 ? stringarg /* use start pos rather than pos() */
2849 : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
2850 /* Defined pos(): */
2851 ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
2852 : strbeg; /* pos() not defined; use start of string */
2854 DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log,
2855 "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg)));
2857 /* in the presence of \G, we may need to start looking earlier in
2858 * the string than the suggested start point of stringarg:
2859 * if prog->gofs is set, then that's a known, fixed minimum
2862 * /ab|c\G/: gofs = 1
2863 * or if the minimum offset isn't known, then we have to go back
2864 * to the start of the string, e.g. /w+\G/
2867 if (prog->intflags & PREGf_ANCH_GPOS) {
2868 startpos = reginfo->ganch - prog->gofs;
2870 ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
2872 DEBUG_r(PerlIO_printf(Perl_debug_log,
2873 "fail: ganch-gofs before earliest possible start\n"));
2877 else if (prog->gofs) {
2878 if (startpos - prog->gofs < strbeg)
2881 startpos -= prog->gofs;
2883 else if (prog->intflags & PREGf_GPOS_FLOAT)
2887 minlen = prog->minlen;
2888 if ((startpos + minlen) > strend || startpos < strbeg) {
2889 DEBUG_r(PerlIO_printf(Perl_debug_log,
2890 "Regex match can't succeed, so not even tried\n"));
2894 /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave),
2895 * which will call destuctors to reset PL_regmatch_state, free higher
2896 * PL_regmatch_slabs, and clean up regmatch_info_aux and
2897 * regmatch_info_aux_eval */
2899 oldsave = PL_savestack_ix;
2903 if ((prog->extflags & RXf_USE_INTUIT)
2904 && !(flags & REXEC_CHECKED))
2906 s = re_intuit_start(rx, sv, strbeg, startpos, strend,
2911 if (prog->extflags & RXf_CHECK_ALL) {
2912 /* we can match based purely on the result of INTUIT.
2913 * Set up captures etc just for $& and $-[0]
2914 * (an intuit-only match wont have $1,$2,..) */
2915 assert(!prog->nparens);
2917 /* s/// doesn't like it if $& is earlier than where we asked it to
2918 * start searching (which can happen on something like /.\G/) */
2919 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
2922 /* this should only be possible under \G */
2923 assert(prog->intflags & PREGf_GPOS_SEEN);
2924 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2925 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
2929 /* match via INTUIT shouldn't have any captures.
2930 * Let @-, @+, $^N know */
2931 prog->lastparen = prog->lastcloseparen = 0;
2932 RX_MATCH_UTF8_set(rx, utf8_target);
2933 prog->offs[0].start = s - strbeg;
2934 prog->offs[0].end = utf8_target
2935 ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg
2936 : s - strbeg + prog->minlenret;
2937 if ( !(flags & REXEC_NOT_FIRST) )
2938 S_reg_set_capture_string(aTHX_ rx,
2940 sv, flags, utf8_target);
2946 multiline = prog->extflags & RXf_PMf_MULTILINE;
2948 if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) {
2949 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
2950 "String too short [regexec_flags]...\n"));
2954 /* Check validity of program. */
2955 if (UCHARAT(progi->program) != REG_MAGIC) {
2956 Perl_croak(aTHX_ "corrupted regexp program");
2959 RX_MATCH_TAINTED_off(rx);
2960 RX_MATCH_UTF8_set(rx, utf8_target);
2962 reginfo->prog = rx; /* Yes, sorry that this is confusing. */
2963 reginfo->intuit = 0;
2964 reginfo->is_utf8_target = cBOOL(utf8_target);
2965 reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
2966 reginfo->warned = FALSE;
2967 reginfo->strbeg = strbeg;
2969 reginfo->poscache_maxiter = 0; /* not yet started a countdown */
2970 reginfo->strend = strend;
2971 /* see how far we have to get to not match where we matched before */
2972 reginfo->till = stringarg + minend;
2974 if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv)) {
2975 /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after
2976 S_cleanup_regmatch_info_aux has executed (registered by
2977 SAVEDESTRUCTOR_X below). S_cleanup_regmatch_info_aux modifies
2978 magic belonging to this SV.
2979 Not newSVsv, either, as it does not COW.
2981 reginfo->sv = newSV(0);
2982 SvSetSV_nosteal(reginfo->sv, sv);
2983 SAVEFREESV(reginfo->sv);
2986 /* reserve next 2 or 3 slots in PL_regmatch_state:
2987 * slot N+0: may currently be in use: skip it
2988 * slot N+1: use for regmatch_info_aux struct
2989 * slot N+2: use for regmatch_info_aux_eval struct if we have (?{})'s
2990 * slot N+3: ready for use by regmatch()
2994 regmatch_state *old_regmatch_state;
2995 regmatch_slab *old_regmatch_slab;
2996 int i, max = (prog->extflags & RXf_EVAL_SEEN) ? 2 : 1;
2998 /* on first ever match, allocate first slab */
2999 if (!PL_regmatch_slab) {
3000 Newx(PL_regmatch_slab, 1, regmatch_slab);
3001 PL_regmatch_slab->prev = NULL;
3002 PL_regmatch_slab->next = NULL;
3003 PL_regmatch_state = SLAB_FIRST(PL_regmatch_slab);
3006 old_regmatch_state = PL_regmatch_state;
3007 old_regmatch_slab = PL_regmatch_slab;
3009 for (i=0; i <= max; i++) {
3011 reginfo->info_aux = &(PL_regmatch_state->u.info_aux);
3013 reginfo->info_aux_eval =
3014 reginfo->info_aux->info_aux_eval =
3015 &(PL_regmatch_state->u.info_aux_eval);
3017 if (++PL_regmatch_state > SLAB_LAST(PL_regmatch_slab))
3018 PL_regmatch_state = S_push_slab(aTHX);
3021 /* note initial PL_regmatch_state position; at end of match we'll
3022 * pop back to there and free any higher slabs */
3024 reginfo->info_aux->old_regmatch_state = old_regmatch_state;
3025 reginfo->info_aux->old_regmatch_slab = old_regmatch_slab;
3026 reginfo->info_aux->poscache = NULL;
3028 SAVEDESTRUCTOR_X(S_cleanup_regmatch_info_aux, reginfo->info_aux);
3030 if ((prog->extflags & RXf_EVAL_SEEN))
3031 S_setup_eval_state(aTHX_ reginfo);
3033 reginfo->info_aux_eval = reginfo->info_aux->info_aux_eval = NULL;
3036 /* If there is a "must appear" string, look for it. */
3038 if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) {
3039 /* We have to be careful. If the previous successful match
3040 was from this regex we don't want a subsequent partially
3041 successful match to clobber the old results.
3042 So when we detect this possibility we add a swap buffer
3043 to the re, and switch the buffer each match. If we fail,
3044 we switch it back; otherwise we leave it swapped.
3047 /* do we need a save destructor here for eval dies? */
3048 Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair);
3049 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3050 "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n",
3057 /* Simplest case: anchored match need be tried only once, or with
3058 * MBOL, only at the beginning of each line.
3060 * Note that /.*.../ sets PREGf_IMPLICIT|MBOL, while /.*.../s sets
3061 * PREGf_IMPLICIT|SBOL. The idea is that with /.*.../s, if it doesn't
3062 * match at the start of the string then it won't match anywhere else
3063 * either; while with /.*.../, if it doesn't match at the beginning,
3064 * the earliest it could match is at the start of the next line */
3066 if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
3069 if (regtry(reginfo, &s))
3072 if (!(prog->intflags & PREGf_ANCH_MBOL))
3075 /* didn't match at start, try at other newline positions */
3078 dontbother = minlen - 1;
3079 end = HOP3c(strend, -dontbother, strbeg) - 1;
3081 /* skip to next newline */
3083 while (s <= end) { /* note it could be possible to match at the end of the string */
3084 /* NB: newlines are the same in unicode as they are in latin */
3087 if (prog->check_substr || prog->check_utf8) {
3088 /* note that with PREGf_IMPLICIT, intuit can only fail
3089 * or return the start position, so it's of limited utility.
3090 * Nevertheless, I made the decision that the potential for
3091 * quick fail was still worth it - DAPM */
3092 s = re_intuit_start(rx, sv, strbeg, s, strend, flags, NULL);
3096 if (regtry(reginfo, &s))
3100 } /* end anchored search */
3102 if (prog->intflags & PREGf_ANCH_GPOS)
3104 /* PREGf_ANCH_GPOS should never be true if PREGf_GPOS_SEEN is not true */
3105 assert(prog->intflags & PREGf_GPOS_SEEN);
3106 /* For anchored \G, the only position it can match from is
3107 * (ganch-gofs); we already set startpos to this above; if intuit
3108 * moved us on from there, we can't possibly succeed */
3109 assert(startpos == reginfo->ganch - prog->gofs);
3110 if (s == startpos && regtry(reginfo, &s))
3115 /* Messy cases: unanchored match. */
3116 if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) {
3117 /* we have /x+whatever/ */
3118 /* it must be a one character string (XXXX Except is_utf8_pat?) */
3124 if (! prog->anchored_utf8) {
3125 to_utf8_substr(prog);
3127 ch = SvPVX_const(prog->anchored_utf8)[0];
3130 DEBUG_EXECUTE_r( did_match = 1 );
3131 if (regtry(reginfo, &s)) goto got_it;
3133 while (s < strend && *s == ch)
3140 if (! prog->anchored_substr) {
3141 if (! to_byte_substr(prog)) {
3142 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3145 ch = SvPVX_const(prog->anchored_substr)[0];
3148 DEBUG_EXECUTE_r( did_match = 1 );
3149 if (regtry(reginfo, &s)) goto got_it;
3151 while (s < strend && *s == ch)
3156 DEBUG_EXECUTE_r(if (!did_match)
3157 PerlIO_printf(Perl_debug_log,
3158 "Did not find anchored character...\n")
3161 else if (prog->anchored_substr != NULL
3162 || prog->anchored_utf8 != NULL
3163 || ((prog->float_substr != NULL || prog->float_utf8 != NULL)
3164 && prog->float_max_offset < strend - s)) {
3169 char *last1; /* Last position checked before */
3173 if (prog->anchored_substr || prog->anchored_utf8) {
3175 if (! prog->anchored_utf8) {
3176 to_utf8_substr(prog);
3178 must = prog->anchored_utf8;
3181 if (! prog->anchored_substr) {
3182 if (! to_byte_substr(prog)) {
3183 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3186 must = prog->anchored_substr;
3188 back_max = back_min = prog->anchored_offset;
3191 if (! prog->float_utf8) {
3192 to_utf8_substr(prog);
3194 must = prog->float_utf8;
3197 if (! prog->float_substr) {
3198 if (! to_byte_substr(prog)) {
3199 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3202 must = prog->float_substr;
3204 back_max = prog->float_max_offset;
3205 back_min = prog->float_min_offset;
3211 last = HOP3c(strend, /* Cannot start after this */
3212 -(SSize_t)(CHR_SVLEN(must)
3213 - (SvTAIL(must) != 0) + back_min), strbeg);
3215 if (s > reginfo->strbeg)
3216 last1 = HOPc(s, -1);
3218 last1 = s - 1; /* bogus */
3220 /* XXXX check_substr already used to find "s", can optimize if
3221 check_substr==must. */
3223 strend = HOPc(strend, -dontbother);
3224 while ( (s <= last) &&
3225 (s = fbm_instr((unsigned char*)HOP4c(s, back_min, strbeg, strend),
3226 (unsigned char*)strend, must,
3227 multiline ? FBMrf_MULTILINE : 0)) ) {
3228 DEBUG_EXECUTE_r( did_match = 1 );
3229 if (HOPc(s, -back_max) > last1) {
3230 last1 = HOPc(s, -back_min);
3231 s = HOPc(s, -back_max);
3234 char * const t = (last1 >= reginfo->strbeg)
3235 ? HOPc(last1, 1) : last1 + 1;
3237 last1 = HOPc(s, -back_min);
3241 while (s <= last1) {
3242 if (regtry(reginfo, &s))
3245 s++; /* to break out of outer loop */
3252 while (s <= last1) {
3253 if (regtry(reginfo, &s))
3259 DEBUG_EXECUTE_r(if (!did_match) {
3260 RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
3261 SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
3262 PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n",
3263 ((must == prog->anchored_substr || must == prog->anchored_utf8)
3264 ? "anchored" : "floating"),
3265 quoted, RE_SV_TAIL(must));
3269 else if ( (c = progi->regstclass) ) {
3271 const OPCODE op = OP(progi->regstclass);
3272 /* don't bother with what can't match */
3273 if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
3274 strend = HOPc(strend, -(minlen - 1));
3277 SV * const prop = sv_newmortal();
3278 regprop(prog, prop, c, reginfo, NULL);
3280 RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
3282 PerlIO_printf(Perl_debug_log,
3283 "Matching stclass %.*s against %s (%d bytes)\n",
3284 (int)SvCUR(prop), SvPVX_const(prop),
3285 quoted, (int)(strend - s));
3288 if (find_byclass(prog, c, s, strend, reginfo))
3290 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Contradicts stclass... [regexec_flags]\n"));
3294 if (prog->float_substr != NULL || prog->float_utf8 != NULL) {
3302 if (! prog->float_utf8) {
3303 to_utf8_substr(prog);
3305 float_real = prog->float_utf8;
3308 if (! prog->float_substr) {
3309 if (! to_byte_substr(prog)) {
3310 NON_UTF8_TARGET_BUT_UTF8_REQUIRED(phooey);
3313 float_real = prog->float_substr;
3316 little = SvPV_const(float_real, len);
3317 if (SvTAIL(float_real)) {
3318 /* This means that float_real contains an artificial \n on
3319 * the end due to the presence of something like this:
3320 * /foo$/ where we can match both "foo" and "foo\n" at the
3321 * end of the string. So we have to compare the end of the
3322 * string first against the float_real without the \n and
3323 * then against the full float_real with the string. We
3324 * have to watch out for cases where the string might be
3325 * smaller than the float_real or the float_real without
3327 char *checkpos= strend - len;
3329 PerlIO_printf(Perl_debug_log,
3330 "%sChecking for float_real.%s\n",
3331 PL_colors[4], PL_colors[5]));
3332 if (checkpos + 1 < strbeg) {
3333 /* can't match, even if we remove the trailing \n
3334 * string is too short to match */
3336 PerlIO_printf(Perl_debug_log,
3337 "%sString shorter than required trailing substring, cannot match.%s\n",
3338 PL_colors[4], PL_colors[5]));
3340 } else if (memEQ(checkpos + 1, little, len - 1)) {
3341 /* can match, the end of the string matches without the
3343 last = checkpos + 1;
3344 } else if (checkpos < strbeg) {
3345 /* cant match, string is too short when the "\n" is
3348 PerlIO_printf(Perl_debug_log,
3349 "%sString does not contain required trailing substring, cannot match.%s\n",
3350 PL_colors[4], PL_colors[5]));
3352 } else if (!multiline) {
3353 /* non multiline match, so compare with the "\n" at the
3354 * end of the string */
3355 if (memEQ(checkpos, little, len)) {
3359 PerlIO_printf(Perl_debug_log,
3360 "%sString does not contain required trailing substring, cannot match.%s\n",
3361 PL_colors[4], PL_colors[5]));
3365 /* multiline match, so we have to search for a place
3366 * where the full string is located */
3372 last = rninstr(s, strend, little, little + len);
3374 last = strend; /* matching "$" */
3377 /* at one point this block contained a comment which was
3378 * probably incorrect, which said that this was a "should not
3379 * happen" case. Even if it was true when it was written I am
3380 * pretty sure it is not anymore, so I have removed the comment
3381 * and replaced it with this one. Yves */
3383 PerlIO_printf(Perl_debug_log,
3384 "%sString does not contain required substring, cannot match.%s\n",
3385 PL_colors[4], PL_colors[5]
3389 dontbother = strend - last + prog->float_min_offset;
3391 if (minlen && (dontbother < minlen))
3392 dontbother = minlen - 1;
3393 strend -= dontbother; /* this one's always in bytes! */
3394 /* We don't know much -- general case. */
3397 if (regtry(reginfo, &s))
3406 if (regtry(reginfo, &s))
3408 } while (s++ < strend);
3416 /* s/// doesn't like it if $& is earlier than where we asked it to
3417 * start searching (which can happen on something like /.\G/) */
3418 if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
3419 && (prog->offs[0].start < stringarg - strbeg))
3421 /* this should only be possible under \G */
3422 assert(prog->intflags & PREGf_GPOS_SEEN);
3423 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
3424 "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n"));
3430 PerlIO_printf(Perl_debug_log,
3431 "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n",
3438 /* clean up; this will trigger destructors that will free all slabs
3439 * above the current one, and cleanup the regmatch_info_aux
3440 * and regmatch_info_aux_eval sructs */
3442 LEAVE_SCOPE(oldsave);
3444 if (RXp_PAREN_NAMES(prog))
3445 (void)hv_iterinit(RXp_PAREN_NAMES(prog));
3447 /* make sure $`, $&, $', and $digit will work later */
3448 if ( !(flags & REXEC_NOT_FIRST) )
3449 S_reg_set_capture_string(aTHX_ rx,
3450 strbeg, reginfo->strend,
3451 sv, flags, utf8_target);
3456 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
3457 PL_colors[4], PL_colors[5]));
3459 /* clean up; this will trigger destructors that will free all slabs
3460 * above the current one, and cleanup the regmatch_info_aux
3461 * and regmatch_info_aux_eval sructs */
3463 LEAVE_SCOPE(oldsave);
3466 /* we failed :-( roll it back */
3467 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
3468 "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n",
3473 Safefree(prog->offs);
3480 /* Set which rex is pointed to by PL_reg_curpm, handling ref counting.
3481 * Do inc before dec, in case old and new rex are the same */
3482 #define SET_reg_curpm(Re2) \
3483 if (reginfo->info_aux_eval) { \
3484 (void)ReREFCNT_inc(Re2); \
3485 ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \
3486 PM_SETRE((PL_reg_curpm), (Re2)); \
3491 - regtry - try match at specific point
3493 STATIC I32 /* 0 failure, 1 success */
3494 S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
3497 REGEXP *const rx = reginfo->prog;
3498 regexp *const prog = ReANY(rx);
3500 RXi_GET_DECL(prog,progi);
3501 GET_RE_DEBUG_FLAGS_DECL;
3503 PERL_ARGS_ASSERT_REGTRY;
3505 reginfo->cutpoint=NULL;
3507 prog->offs[0].start = *startposp - reginfo->strbeg;
3508 prog->lastparen = 0;
3509 prog->lastcloseparen = 0;
3511 /* XXXX What this code is doing here?!!! There should be no need
3512 to do this again and again, prog->lastparen should take care of
3515 /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code.
3516 * Actually, the code in regcppop() (which Ilya may be meaning by
3517 * prog->lastparen), is not needed at all by the test suite
3518 * (op/regexp, op/pat, op/split), but that code is needed otherwise
3519 * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/
3520 * Meanwhile, this code *is* needed for the
3521 * above-mentioned test suite tests to succeed. The common theme
3522 * on those tests seems to be returning null fields from matches.
3523 * --jhi updated by dapm */
3525 if (prog->nparens) {
3526 regexp_paren_pair *pp = prog->offs;
3528 for (i = prog->nparens; i > (I32)prog->lastparen; i--) {
3536 result = regmatch(reginfo, *startposp, progi->program + 1);
3538 prog->offs[0].end = result;
3541 if (reginfo->cutpoint)
3542 *startposp= reginfo->cutpoint;
3543 REGCP_UNWIND(lastcp);
3548 #define sayYES goto yes
3549 #define sayNO goto no
3550 #define sayNO_SILENT goto no_silent
3552 /* we dont use STMT_START/END here because it leads to
3553 "unreachable code" warnings, which are bogus, but distracting. */
3554 #define CACHEsayNO \
3555 if (ST.cache_mask) \
3556 reginfo->info_aux->poscache[ST.cache_offset] |= ST.cache_mask; \
3559 /* this is used to determine how far from the left messages like
3560 'failed...' are printed. It should be set such that messages
3561 are inline with the regop output that created them.
3563 #define REPORT_CODE_OFF 32
3566 #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */
3567 #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */
3568 #define CHRTEST_NOT_A_CP_1 -999
3569 #define CHRTEST_NOT_A_CP_2 -998
3571 /* grab a new slab and return the first slot in it */
3573 STATIC regmatch_state *
3576 #if PERL_VERSION < 9 && !defined(PERL_CORE)
3579 regmatch_slab *s = PL_regmatch_slab->next;
3581 Newx(s, 1, regmatch_slab);
3582 s->prev = PL_regmatch_slab;
3584 PL_regmatch_slab->next = s;
3586 PL_regmatch_slab = s;
3587 return SLAB_FIRST(s);
3591 /* push a new state then goto it */
3593 #define PUSH_STATE_GOTO(state, node, input) \
3594 pushinput = input; \
3596 st->resume_state = state; \
3599 /* push a new state with success backtracking, then goto it */
3601 #define PUSH_YES_STATE_GOTO(state, node, input) \
3602 pushinput = input; \
3604 st->resume_state = state; \
3605 goto push_yes_state;
3612 regmatch() - main matching routine
3614 This is basically one big switch statement in a loop. We execute an op,
3615 set 'next' to point the next op, and continue. If we come to a point which
3616 we may need to backtrack to on failure such as (A|B|C), we push a
3617 backtrack state onto the backtrack stack. On failure, we pop the top
3618 state, and re-enter the loop at the state indicated. If there are no more
3619 states to pop, we return failure.
3621 Sometimes we also need to backtrack on success; for example /A+/, where
3622 after successfully matching one A, we need to go back and try to
3623 match another one; similarly for lookahead assertions: if the assertion
3624 completes successfully, we backtrack to the state just before the assertion
3625 and then carry on. In these cases, the pushed state is marked as
3626 'backtrack on success too'. This marking is in fact done by a chain of
3627 pointers, each pointing to the previous 'yes' state. On success, we pop to
3628 the nearest yes state, discarding any intermediate failure-only states.
3629 Sometimes a yes state is pushed just to force some cleanup code to be
3630 called at the end of a successful match or submatch; e.g. (??{$re}) uses
3631 it to free the inner regex.
3633 Note that failure backtracking rewinds the cursor position, while
3634 success backtracking leaves it alone.
3636 A pattern is complete when the END op is executed, while a subpattern
3637 such as (?=foo) is complete when the SUCCESS op is executed. Both of these
3638 ops trigger the "pop to last yes state if any, otherwise return true"
3641 A common convention in this function is to use A and B to refer to the two
3642 subpatterns (or to the first nodes thereof) in patterns like /A*B/: so A is
3643 the subpattern to be matched possibly multiple times, while B is the entire
3644 rest of the pattern. Variable and state names reflect this convention.
3646 The states in the main switch are the union of ops and failure/success of
3647 substates associated with with that op. For example, IFMATCH is the op
3648 that does lookahead assertions /(?=A)B/ and so the IFMATCH state means
3649 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just
3650 successfully matched A and IFMATCH_A_fail is a state saying that we have
3651 just failed to match A. Resume states always come in pairs. The backtrack
3652 state we push is marked as 'IFMATCH_A', but when that is popped, we resume
3653 at IFMATCH_A or IFMATCH_A_fail, depending on whether we are backtracking
3654 on success or failure.
3656 The struct that holds a backtracking state is actually a big union, with
3657 one variant for each major type of op. The variable st points to the
3658 top-most backtrack struct. To make the code clearer, within each
3659 block of code we #define ST to alias the relevant union.
3661 Here's a concrete example of a (vastly oversimplified) IFMATCH
3667 #define ST st->u.ifmatch
3669 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3670 ST.foo = ...; // some state we wish to save
3672 // push a yes backtrack state with a resume value of
3673 // IFMATCH_A/IFMATCH_A_fail, then continue execution at the
3675 PUSH_YES_STATE_GOTO(IFMATCH_A, A, newinput);
3678 case IFMATCH_A: // we have successfully executed A; now continue with B
3680 bar = ST.foo; // do something with the preserved value
3683 case IFMATCH_A_fail: // A failed, so the assertion failed
3684 ...; // do some housekeeping, then ...
3685 sayNO; // propagate the failure
3692 For any old-timers reading this who are familiar with the old recursive
3693 approach, the code above is equivalent to:
3695 case IFMATCH: // we are executing the IFMATCH op, (?=A)B
3704 ...; // do some housekeeping, then ...
3705 sayNO; // propagate the failure
3708 The topmost backtrack state, pointed to by st, is usually free. If you
3709 want to claim it, populate any ST.foo fields in it with values you wish to
3710 save, then do one of
3712 PUSH_STATE_GOTO(resume_state, node, newinput);
3713 PUSH_YES_STATE_GOTO(resume_state, node, newinput);
3715 which sets that backtrack state's resume value to 'resume_state', pushes a
3716 new free entry to the top of the backtrack stack, then goes to 'node'.
3717 On backtracking, the free slot is popped, and the saved state becomes the
3718 new free state. An ST.foo field in this new top state can be temporarily
3719 accessed to retrieve values, but once the main loop is re-entered, it
3720 becomes available for reuse.
3722 Note that the depth of the backtrack stack constantly increases during the
3723 left-to-right execution of the pattern, rather than going up and down with
3724 the pattern nesting. For example the stack is at its maximum at Z at the
3725 end of the pattern, rather than at X in the following:
3727 /(((X)+)+)+....(Y)+....Z/
3729 The only exceptions to this are lookahead/behind assertions and the cut,
3730 (?>A), which pop all the backtrack states associated with A before
3733 Backtrack state structs are allocated in slabs of about 4K in size.
3734 PL_regmatch_state and st always point to the currently active state,
3735 and PL_regmatch_slab points to the slab currently containing
3736 PL_regmatch_state. The first time regmatch() is called, the first slab is
3737 allocated, and is never freed until interpreter destruction. When the slab
3738 is full, a new one is allocated and chained to the end. At exit from
3739 regmatch(), slabs allocated since entry are freed.
3744 #define DEBUG_STATE_pp(pp) \
3746 DUMP_EXEC_POS(locinput, scan, utf8_target); \
3747 PerlIO_printf(Perl_debug_log, \
3748 " %*s"pp" %s%s%s%s%s\n", \
3750 PL_reg_name[st->resume_state], \
3751 ((st==yes_state||st==mark_state) ? "[" : ""), \
3752 ((st==yes_state) ? "Y" : ""), \
3753 ((st==mark_state) ? "M" : ""), \
3754 ((st==yes_state||st==mark_state) ? "]" : "") \
3759 #define REG_NODE_NUM(x) ((x) ? (int)((x)-prog) : -1)
3764 S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target,
3765 const char *start, const char *end, const char *blurb)
3767 const bool utf8_pat = RX_UTF8(prog) ? 1 : 0;
3769 PERL_ARGS_ASSERT_DEBUG_START_MATCH;
3774 RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0),
3775 RX_PRECOMP_const(prog), RX_PRELEN(prog), 60);
3777 RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1),
3778 start, end - start, 60);
3780 PerlIO_printf(Perl_debug_log,
3781 "%s%s REx%s %s against %s\n",
3782 PL_colors[4], blurb, PL_colors[5], s0, s1);
3784 if (utf8_target||utf8_pat)
3785 PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n",
3786 utf8_pat ? "pattern" : "",
3787 utf8_pat && utf8_target ? " and " : "",
3788 utf8_target ? "string" : ""
3794 S_dump_exec_pos(pTHX_ const char *locinput,
3795 const regnode *scan,
3796 const char *loc_regeol,
3797 const char *loc_bostr,
3798 const char *loc_reg_starttry,
3799 const bool utf8_target)
3801 const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4];
3802 const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */
3803 int l = (loc_regeol - locinput) > taill ? taill : (loc_regeol - locinput);
3804 /* The part of the string before starttry has one color
3805 (pref0_len chars), between starttry and current
3806 position another one (pref_len - pref0_len chars),
3807 after the current position the third one.
3808 We assume that pref0_len <= pref_len, otherwise we
3809 decrease pref0_len. */
3810 int pref_len = (locinput - loc_bostr) > (5 + taill) - l
3811 ? (5 + taill) - l : locinput - loc_bostr;
3814 PERL_ARGS_ASSERT_DUMP_EXEC_POS;
3816 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len)))
3818 pref0_len = pref_len - (locinput - loc_reg_starttry);
3819 if (l + pref_len < (5 + taill) && l < loc_regeol - locinput)
3820 l = ( loc_regeol - locinput > (5 + taill) - pref_len
3821 ? (5 + taill) - pref_len : loc_regeol - locinput);
3822 while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l)))
3826 if (pref0_len > pref_len)
3827 pref0_len = pref_len;
3829 const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
3831 RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
3832 (locinput - pref_len),pref0_len, 60, 4, 5);
3834 RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1),
3835 (locinput - pref_len + pref0_len),
3836 pref_len - pref0_len, 60, 2, 3);
3838 RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2),
3839 locinput, loc_regeol - locinput, 10, 0, 1);
3841 const STRLEN tlen=len0+len1+len2;
3842 PerlIO_printf(Perl_debug_log,
3843 "%4"IVdf" <%.*s%.*s%s%.*s>%*s|",
3844 (IV)(locinput - loc_bostr),
3847 (docolor ? "" : "> <"),
3849 (int)(tlen > 19 ? 0 : 19 - tlen),
3856 /* reg_check_named_buff_matched()
3857 * Checks to see if a named buffer has matched. The data array of
3858 * buffer numbers corresponding to the buffer is expected to reside
3859 * in the regexp->data->data array in the slot stored in the ARG() of
3860 * node involved. Note that this routine doesn't actually care about the
3861 * name, that information is not preserved from compilation to execution.
3862 * Returns the index of the leftmost defined buffer with the given name
3863 * or 0 if non of the buffers matched.
3866 S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan)
3869 RXi_GET_DECL(rex,rexi);
3870 SV *sv_dat= MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
3871 I32 *nums=(I32*)SvPVX(sv_dat);
3873 PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED;
3875 for ( n=0; n<SvIVX(sv_dat); n++ ) {
3876 if ((I32)rex->lastparen >= nums[n] &&
3877 rex->offs[nums[n]].end != -1)
3887 S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p,
3888 U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo)
3890 /* This function determines if there are one or two characters that match
3891 * the first character of the passed-in EXACTish node <text_node>, and if
3892 * so, returns them in the passed-in pointers.
3894 * If it determines that no possible character in the target string can
3895 * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if
3896 * the first character in <text_node> requires UTF-8 to represent, and the
3897 * target string isn't in UTF-8.)
3899 * If there are more than two characters that could match the beginning of
3900 * <text_node>, or if more context is required to determine a match or not,
3901 * it sets both *<c1p> and *<c2p> to CHRTEST_VOID.
3903 * The motiviation behind this function is to allow the caller to set up
3904 * tight loops for matching. If <text_node> is of type EXACT, there is
3905 * only one possible character that can match its first character, and so
3906 * the situation is quite simple. But things get much more complicated if
3907 * folding is involved. It may be that the first character of an EXACTFish
3908 * node doesn't participate in any possible fold, e.g., punctuation, so it
3909 * can be matched only by itself. The vast majority of characters that are
3910 * in folds match just two things, their lower and upper-case equivalents.
3911 * But not all are like that; some have multiple possible matches, or match
3912 * sequences of more than one character. This function sorts all that out.
3914 * Consider the patterns A*B or A*?B where A and B are arbitrary. In a
3915 * loop of trying to match A*, we know we can't exit where the thing
3916 * following it isn't a B. And something can't be a B unless it is the
3917 * beginning of B. By putting a quick test for that beginning in a tight
3918 * loop, we can rule out things that can't possibly be B without having to
3919 * break out of the loop, thus avoiding work. Similarly, if A is a single
3920 * character, we can make a tight loop matching A*, using the outputs of
3923 * If the target string to match isn't in UTF-8, and there aren't
3924 * complications which require CHRTEST_VOID, *<c1p> and *<c2p> are set to
3925 * the one or two possible octets (which are characters in this situation)
3926 * that can match. In all cases, if there is only one character that can
3927 * match, *<c1p> and *<c2p> will be identical.
3929 * If the target string is in UTF-8, the buffers pointed to by <c1_utf8>
3930 * and <c2_utf8> will contain the one or two UTF-8 sequences of bytes that
3931 * can match the beginning of <text_node>. They should be declared with at
3932 * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is
3933 * undefined what these contain.) If one or both of the buffers are
3934 * invariant under UTF-8, *<c1p>, and *<c2p> will also be set to the
3935 * corresponding invariant. If variant, the corresponding *<c1p> and/or
3936 * *<c2p> will be set to a negative number(s) that shouldn't match any code
3937 * point (unless inappropriately coerced to unsigned). *<c1p> will equal
3938 * *<c2p> if and only if <c1_utf8> and <c2_utf8> are the same. */
3940 const bool utf8_target = reginfo->is_utf8_target;
3942 UV c1 = (UV)CHRTEST_NOT_A_CP_1;
3943 UV c2 = (UV)CHRTEST_NOT_A_CP_2;
3944 bool use_chrtest_void = FALSE;
3945 const bool is_utf8_pat = reginfo->is_utf8_pat;
3947 /* Used when we have both utf8 input and utf8 output, to avoid converting
3948 * to/from code points */
3949 bool utf8_has_been_setup = FALSE;
3953 U8 *pat = (U8*)STRING(text_node);
3954 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
3956 if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
3958 /* In an exact node, only one thing can be matched, that first
3959 * character. If both the pat and the target are UTF-8, we can just
3960 * copy the input to the output, avoiding finding the code point of
3965 else if (utf8_target) {
3966 Copy(pat, c1_utf8, UTF8SKIP(pat), U8);
3967 Copy(pat, c2_utf8, UTF8SKIP(pat), U8);
3968 utf8_has_been_setup = TRUE;
3971 c2 = c1 = valid_utf8_to_uvchr(pat, NULL);
3974 else { /* an EXACTFish node */
3975 U8 *pat_end = pat + STR_LEN(text_node);
3977 /* An EXACTFL node has at least some characters unfolded, because what
3978 * they match is not known until now. So, now is the time to fold
3979 * the first few of them, as many as are needed to determine 'c1' and
3980 * 'c2' later in the routine. If the pattern isn't UTF-8, we only need
3981 * to fold if in a UTF-8 locale, and then only the Sharp S; everything
3982 * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we
3983 * need to fold as many characters as a single character can fold to,
3984 * so that later we can check if the first ones are such a multi-char
3985 * fold. But, in such a pattern only locale-problematic characters
3986 * aren't folded, so we can skip this completely if the first character
3987 * in the node isn't one of the tricky ones */
3988 if (OP(text_node) == EXACTFL) {
3990 if (! is_utf8_pat) {
3991 if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S)
3993 folded[0] = folded[1] = 's';
3995 pat_end = folded + 2;
3998 else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) {
4003 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) {
4005 *(d++) = (U8) toFOLD_LC(*s);
4010 _to_utf8_fold_flags(s,
4013 FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE);
4024 if ((is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end))
4025 || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end)))
4027 /* Multi-character folds require more context to sort out. Also
4028 * PL_utf8_foldclosures used below doesn't handle them, so have to
4029 * be handled outside this routine */
4030 use_chrtest_void = TRUE;
4032 else { /* an EXACTFish node which doesn't begin with a multi-char fold */
4033 c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat;
4035 /* Load the folds hash, if not already done */
4037 if (! PL_utf8_foldclosures) {
4038 _load_PL_utf8_foldclosures();
4041 /* The fold closures data structure is a hash with the keys
4042 * being the UTF-8 of every character that is folded to, like
4043 * 'k', and the values each an array of all code points that
4044 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
4045 * Multi-character folds are not included */
4046 if ((! (listp = hv_fetch(PL_utf8_foldclosures,
4051 /* Not found in the hash, therefore there are no folds
4052 * containing it, so there is only a single character that
4056 else { /* Does participate in folds */
4057 AV* list = (AV*) *listp;
4058 if (av_tindex(list) != 1) {
4060 /* If there aren't exactly two folds to this, it is
4061 * outside the scope of this function */
4062 use_chrtest_void = TRUE;
4064 else { /* There are two. Get them */
4065 SV** c_p = av_fetch(list, 0, FALSE);
4067 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4071 c_p = av_fetch(list, 1, FALSE);
4073 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
4077 /* Folds that cross the 255/256 boundary are forbidden
4078 * if EXACTFL (and isnt a UTF8 locale), or EXACTFA and
4079 * one is ASCIII. Since the pattern character is above
4080 * 255, and its only other match is below 256, the only
4081 * legal match will be to itself. We have thrown away
4082 * the original, so have to compute which is the one
4084 if ((c1 < 256) != (c2 < 256)) {
4085 if ((OP(text_node) == EXACTFL
4086 && ! IN_UTF8_CTYPE_LOCALE)
4087 || ((OP(text_node) == EXACTFA
4088 || OP(text_node) == EXACTFA_NO_TRIE)
4089 && (isASCII(c1) || isASCII(c2))))
4102 else /* Here, c1 is <= 255 */
4104 && HAS_NONLATIN1_FOLD_CLOSURE(c1)
4105 && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE))
4106 && ((OP(text_node) != EXACTFA
4107 && OP(text_node) != EXACTFA_NO_TRIE)
4110 /* Here, there could be something above Latin1 in the target
4111 * which folds to this character in the pattern. All such
4112 * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more
4113 * than two characters involved in their folds, so are outside
4114 * the scope of this function */
4115 if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
4116 c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
4119 use_chrtest_void = TRUE;
4122 else { /* Here nothing above Latin1 can fold to the pattern
4124 switch (OP(text_node)) {
4126 case EXACTFL: /* /l rules */
4127 c2 = PL_fold_locale[c1];
4130 case EXACTF: /* This node only generated for non-utf8
4132 assert(! is_utf8_pat);
4133 if (! utf8_target) { /* /d rules */
4138 /* /u rules for all these. This happens to work for
4139 * EXACTFA as nothing in Latin1 folds to ASCII */
4140 case EXACTFA_NO_TRIE: /* This node only generated for
4141 non-utf8 patterns */
4142 assert(! is_utf8_pat);
4147 c2 = PL_fold_latin1[c1];
4151 Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
4152 NOT_REACHED; /* NOTREACHED */
4158 /* Here have figured things out. Set up the returns */
4159 if (use_chrtest_void) {
4160 *c2p = *c1p = CHRTEST_VOID;
4162 else if (utf8_target) {
4163 if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
4164 uvchr_to_utf8(c1_utf8, c1);
4165 uvchr_to_utf8(c2_utf8, c2);
4168 /* Invariants are stored in both the utf8 and byte outputs; Use
4169 * negative numbers otherwise for the byte ones. Make sure that the
4170 * byte ones are the same iff the utf8 ones are the same */
4171 *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
4172 *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
4175 ? CHRTEST_NOT_A_CP_1
4176 : CHRTEST_NOT_A_CP_2;
4178 else if (c1 > 255) {
4179 if (c2 > 255) { /* both possibilities are above what a non-utf8 string
4184 *c1p = *c2p = c2; /* c2 is the only representable value */
4186 else { /* c1 is representable; see about c2 */
4188 *c2p = (c2 < 256) ? c2 : c1;
4194 /* This creates a single number by combining two, with 'before' being like the
4195 * 10's digit, but this isn't necessarily base 10; it is base however many
4196 * elements of the enum there are */
4197 #define GCBcase(before, after) ((GCB_ENUM_COUNT * before) + after)
4200 S_isGCB(const GCB_enum before, const GCB_enum after)
4202 /* returns a boolean indicating if there is a Grapheme Cluster Boundary
4203 * between the inputs. See http://www.unicode.org/reports/tr29/ */
4205 switch (GCBcase(before, after)) {
4207 /* Break at the start and end of text.
4211 Break before and after controls except between CR and LF
4212 GB4. ( Control | CR | LF ) ÷
4213 GB5. ÷ ( Control | CR | LF )
4215 Otherwise, break everywhere.
4220 /* Do not break between a CR and LF.
4222 case GCBcase(GCB_CR, GCB_LF):
4225 /* Do not break Hangul syllable sequences.
4226 GB6. L × ( L | V | LV | LVT ) */
4227 case GCBcase(GCB_L, GCB_L):
4228 case GCBcase(GCB_L, GCB_V):
4229 case GCBcase(GCB_L, GCB_LV):
4230 case GCBcase(GCB_L, GCB_LVT):
4233 /* GB7. ( LV | V ) × ( V | T ) */
4234 case GCBcase(GCB_LV, GCB_V):
4235 case GCBcase(GCB_LV, GCB_T):
4236 case GCBcase(GCB_V, GCB_V):
4237 case GCBcase(GCB_V, GCB_T):
4240 /* GB8. ( LVT | T) × T */
4241 case GCBcase(GCB_LVT, GCB_T):
4242 case GCBcase(GCB_T, GCB_T):
4245 /* Do not break between regional indicator symbols.
4246 GB8a. Regional_Indicator × Regional_Indicator */
4247 case GCBcase(GCB_Regional_Indicator, GCB_Regional_Indicator):
4250 /* Do not break before extending characters.
4252 case GCBcase(GCB_Other, GCB_Extend):
4253 case GCBcase(GCB_Extend, GCB_Extend):
4254 case GCBcase(GCB_L, GCB_Extend):
4255 case GCBcase(GCB_LV, GCB_Extend):
4256 case GCBcase(GCB_LVT, GCB_Extend):
4257 case GCBcase(GCB_Prepend, GCB_Extend):
4258 case GCBcase(GCB_Regional_Indicator, GCB_Extend):
4259 case GCBcase(GCB_SpacingMark, GCB_Extend):
4260 case GCBcase(GCB_T, GCB_Extend):
4261 case GCBcase(GCB_V, GCB_Extend):
4264 /* Do not break before SpacingMarks, or after Prepend characters.
4265 GB9a. × SpacingMark */
4266 case GCBcase(GCB_Other, GCB_SpacingMark):
4267 case GCBcase(GCB_Extend, GCB_SpacingMark):
4268 case GCBcase(GCB_L, GCB_SpacingMark):
4269 case GCBcase(GCB_LV, GCB_SpacingMark):
4270 case GCBcase(GCB_LVT, GCB_SpacingMark):
4271 case GCBcase(GCB_Prepend, GCB_SpacingMark):
4272 case GCBcase(GCB_Regional_Indicator, GCB_SpacingMark):
4273 case GCBcase(GCB_SpacingMark, GCB_SpacingMark):
4274 case GCBcase(GCB_T, GCB_SpacingMark):
4275 case GCBcase(GCB_V, GCB_SpacingMark):
4278 /* GB9b. Prepend × */
4279 case GCBcase(GCB_Prepend, GCB_Other):
4280 case GCBcase(GCB_Prepend, GCB_L):
4281 case GCBcase(GCB_Prepend, GCB_LV):
4282 case GCBcase(GCB_Prepend, GCB_LVT):
4283 case GCBcase(GCB_Prepend, GCB_Prepend):
4284 case GCBcase(GCB_Prepend, GCB_Regional_Indicator):
4285 case GCBcase(GCB_Prepend, GCB_T):
4286 case GCBcase(GCB_Prepend, GCB_V):
4290 NOT_REACHED; /* NOTREACHED */
4293 #define SBcase(before, after) ((SB_ENUM_COUNT * before) + after)
4296 S_isSB(pTHX_ SB_enum before,
4298 const U8 * const strbeg,
4299 const U8 * const curpos,
4300 const U8 * const strend,
4301 const bool utf8_target)
4303 /* returns a boolean indicating if there is a Sentence Boundary Break
4304 * between the inputs. See http://www.unicode.org/reports/tr29/ */
4306 U8 * lpos = (U8 *) curpos;
4310 PERL_ARGS_ASSERT_ISSB;
4312 /* Break at the start and end of text.
4315 if (before == SB_EDGE || after == SB_EDGE) {
4319 /* SB 3: Do not break within CRLF. */
4320 if (before == SB_CR && after == SB_LF) {
4324 /* Break after paragraph separators. (though why CR and LF are considered
4325 * so is beyond me (khw)
4326 SB4. Sep | CR | LF ÷ */
4327 if (before == SB_Sep || before == SB_CR || before == SB_LF) {
4331 /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
4332 * (See Section 6.2, Replacing Ignore Rules.)
4333 SB5. X (Extend | Format)* → X */
4334 if (after == SB_Extend || after == SB_Format) {
4338 if (before == SB_Extend || before == SB_Format) {
4339 before = backup_one_SB(strbeg, &lpos, utf8_target);
4342 /* Do not break after ambiguous terminators like period, if they are
4343 * immediately followed by a number or lowercase letter, if they are
4344 * between uppercase letters, if the first following letter (optionally
4345 * after certain punctuation) is lowercase, or if they are followed by
4346 * "continuation" punctuation such as comma, colon, or semicolon. For
4347 * example, a period may be an abbreviation or numeric period, and thus may
4348 * not mark the end of a sentence.
4350 * SB6. ATerm × Numeric */
4351 if (before == SB_ATerm && after == SB_Numeric) {
4355 /* SB7. Upper ATerm × Upper */
4356 if (before == SB_ATerm && after == SB_Upper) {
4358 if (SB_Upper == backup_one_SB(strbeg, &temp_pos, utf8_target)) {
4363 /* SB8a. (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
4364 * SB10. (STerm | ATerm) Close* Sp* × ( Sp | Sep | CR | LF ) */
4367 while (backup == SB_Sp) {
4368 backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4370 while (backup == SB_Close) {
4371 backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4373 if ((backup == SB_STerm || backup == SB_ATerm)
4374 && ( after == SB_SContinue
4375 || after == SB_STerm
4376 || after == SB_ATerm
4385 /* SB8. ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR | LF |
4386 * STerm | ATerm) )* Lower */
4387 if (backup == SB_ATerm) {
4388 U8 * rpos = (U8 *) curpos;
4389 SB_enum later = after;
4391 while ( later != SB_OLetter
4392 && later != SB_Upper
4393 && later != SB_Lower
4397 && later != SB_STerm
4398 && later != SB_ATerm
4399 && later != SB_EDGE)
4401 later = advance_one_SB(&rpos, strend, utf8_target);
4403 if (later == SB_Lower) {
4408 /* Break after sentence terminators, but include closing punctuation,
4409 * trailing spaces, and a paragraph separator (if present). [See note
4411 * SB9. ( STerm | ATerm ) Close* × ( Close | Sp | Sep | CR | LF ) */
4414 while (backup == SB_Close) {
4415 backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4417 if ((backup == SB_STerm || backup == SB_ATerm)
4418 && ( after == SB_Close
4428 /* SB11. ( STerm | ATerm ) Close* Sp* ( Sep | CR | LF )? ÷ */
4430 backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
4431 if ( backup == SB_Sep
4440 while (backup == SB_Sp) {
4441 backup = backup_one_SB(strbeg, &lpos, utf8_target);
4443 while (backup == SB_Close) {
4444 backup = backup_one_SB(strbeg, &lpos, utf8_target);
4446 if (backup == SB_STerm || backup == SB_ATerm) {
4450 /* Otherwise, do not break.
4457 S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4461 PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
4463 if (*curpos >= strend) {
4469 *curpos += UTF8SKIP(*curpos);
4470 if (*curpos >= strend) {
4473 sb = getSB_VAL_UTF8(*curpos, strend);
4474 } while (sb == SB_Extend || sb == SB_Format);
4479 if (*curpos >= strend) {
4482 sb = getSB_VAL_CP(**curpos);
4483 } while (sb == SB_Extend || sb == SB_Format);
4490 S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4494 PERL_ARGS_ASSERT_BACKUP_ONE_SB;
4496 if (*curpos < strbeg) {
4501 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4502 if (! prev_char_pos) {
4506 /* Back up over Extend and Format. curpos is always just to the right
4507 * of the characater whose value we are getting */
4509 U8 * prev_prev_char_pos;
4510 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
4513 sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4514 *curpos = prev_char_pos;
4515 prev_char_pos = prev_prev_char_pos;
4518 *curpos = (U8 *) strbeg;
4521 } while (sb == SB_Extend || sb == SB_Format);
4525 if (*curpos - 2 < strbeg) {
4526 *curpos = (U8 *) strbeg;
4530 sb = getSB_VAL_CP(*(*curpos - 1));
4531 } while (sb == SB_Extend || sb == SB_Format);
4537 #define WBcase(before, after) ((WB_ENUM_COUNT * before) + after)
4540 S_isWB(pTHX_ WB_enum previous,
4543 const U8 * const strbeg,
4544 const U8 * const curpos,
4545 const U8 * const strend,
4546 const bool utf8_target)
4548 /* Return a boolean as to if the boundary between 'before' and 'after' is
4549 * a Unicode word break, using their published algorithm. Context may be
4550 * needed to make this determination. If the value for the character
4551 * before 'before' is known, it is passed as 'previous'; otherwise that
4552 * should be set to WB_UNKNOWN. The other input parameters give the
4553 * boundaries and current position in the matching of the string. That
4554 * is, 'curpos' marks the position where the character whose wb value is
4555 * 'after' begins. See http://www.unicode.org/reports/tr29/ */
4557 U8 * before_pos = (U8 *) curpos;
4558 U8 * after_pos = (U8 *) curpos;
4560 PERL_ARGS_ASSERT_ISWB;
4562 /* WB1 and WB2: Break at the start and end of text. */
4563 if (before == WB_EDGE || after == WB_EDGE) {
4567 /* WB 3: Do not break within CRLF. */
4568 if (before == WB_CR && after == WB_LF) {
4572 /* WB 3a and WB 3b: Otherwise break before and after Newlines (including CR
4574 if ( before == WB_CR || before == WB_LF || before == WB_Newline
4575 || after == WB_CR || after == WB_LF || after == WB_Newline)
4580 /* Ignore Format and Extend characters, except when they appear at the
4581 * beginning of a region of text.
4582 * WB4. X (Extend | Format)* → X. */
4584 if (after == WB_Extend || after == WB_Format) {
4588 if (before == WB_Extend || before == WB_Format) {
4589 before = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
4592 switch (WBcase(before, after)) {
4593 /* Otherwise, break everywhere (including around ideographs).
4598 /* Do not break between most letters.
4599 WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) */
4600 case WBcase(WB_ALetter, WB_ALetter):
4601 case WBcase(WB_ALetter, WB_Hebrew_Letter):
4602 case WBcase(WB_Hebrew_Letter, WB_ALetter):
4603 case WBcase(WB_Hebrew_Letter, WB_Hebrew_Letter):
4606 /* Do not break letters across certain punctuation.
4607 WB6. (ALetter | Hebrew_Letter)
4608 × (MidLetter | MidNumLet | Single_Quote) (ALetter
4610 case WBcase(WB_ALetter, WB_MidLetter):
4611 case WBcase(WB_ALetter, WB_MidNumLet):
4612 case WBcase(WB_ALetter, WB_Single_Quote):
4613 case WBcase(WB_Hebrew_Letter, WB_MidLetter):
4614 case WBcase(WB_Hebrew_Letter, WB_MidNumLet):
4615 /*case WBcase(WB_Hebrew_Letter, WB_Single_Quote):*/
4616 after = advance_one_WB(&after_pos, strend, utf8_target);
4617 return after != WB_ALetter && after != WB_Hebrew_Letter;
4619 /* WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet |
4620 * Single_Quote) × (ALetter | Hebrew_Letter) */
4621 case WBcase(WB_MidLetter, WB_ALetter):
4622 case WBcase(WB_MidLetter, WB_Hebrew_Letter):
4623 case WBcase(WB_MidNumLet, WB_ALetter):
4624 case WBcase(WB_MidNumLet, WB_Hebrew_Letter):
4625 case WBcase(WB_Single_Quote, WB_ALetter):
4626 case WBcase(WB_Single_Quote, WB_Hebrew_Letter):
4628 = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
4629 return before != WB_ALetter && before != WB_Hebrew_Letter;
4631 /* WB7a. Hebrew_Letter × Single_Quote */
4632 case WBcase(WB_Hebrew_Letter, WB_Single_Quote):
4635 /* WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter */
4636 case WBcase(WB_Hebrew_Letter, WB_Double_Quote):
4637 return advance_one_WB(&after_pos, strend, utf8_target)
4638 != WB_Hebrew_Letter;
4640 /* WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter */
4641 case WBcase(WB_Double_Quote, WB_Hebrew_Letter):
4642 return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
4643 != WB_Hebrew_Letter;
4645 /* Do not break within sequences of digits, or digits adjacent to
4646 * letters (“3a”, or “A3”).
4647 WB8. Numeric × Numeric */
4648 case WBcase(WB_Numeric, WB_Numeric):
4651 /* WB9. (ALetter | Hebrew_Letter) × Numeric */
4652 case WBcase(WB_ALetter, WB_Numeric):
4653 case WBcase(WB_Hebrew_Letter, WB_Numeric):
4656 /* WB10. Numeric × (ALetter | Hebrew_Letter) */
4657 case WBcase(WB_Numeric, WB_ALetter):
4658 case WBcase(WB_Numeric, WB_Hebrew_Letter):
4661 /* Do not break within sequences, such as “3.2” or “3,456.789”.
4662 WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
4664 case WBcase(WB_MidNum, WB_Numeric):
4665 case WBcase(WB_MidNumLet, WB_Numeric):
4666 case WBcase(WB_Single_Quote, WB_Numeric):
4667 return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
4670 /* WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
4672 case WBcase(WB_Numeric, WB_MidNum):
4673 case WBcase(WB_Numeric, WB_MidNumLet):
4674 case WBcase(WB_Numeric, WB_Single_Quote):
4675 return advance_one_WB(&after_pos, strend, utf8_target)
4678 /* Do not break between Katakana.
4679 WB13. Katakana × Katakana */
4680 case WBcase(WB_Katakana, WB_Katakana):
4683 /* Do not break from extenders.
4684 WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana |
4685 ExtendNumLet) × ExtendNumLet */
4686 case WBcase(WB_ALetter, WB_ExtendNumLet):
4687 case WBcase(WB_Hebrew_Letter, WB_ExtendNumLet):
4688 case WBcase(WB_Numeric, WB_ExtendNumLet):
4689 case WBcase(WB_Katakana, WB_ExtendNumLet):
4690 case WBcase(WB_ExtendNumLet, WB_ExtendNumLet):
4693 /* WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric
4695 case WBcase(WB_ExtendNumLet, WB_ALetter):
4696 case WBcase(WB_ExtendNumLet, WB_Hebrew_Letter):
4697 case WBcase(WB_ExtendNumLet, WB_Numeric):
4698 case WBcase(WB_ExtendNumLet, WB_Katakana):
4701 /* Do not break between regional indicator symbols.
4702 WB13c. Regional_Indicator × Regional_Indicator */
4703 case WBcase(WB_Regional_Indicator, WB_Regional_Indicator):
4708 NOT_REACHED; /* NOTREACHED */
4712 S_advance_one_WB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
4716 PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
4718 if (*curpos >= strend) {
4724 /* Advance over Extend and Format */
4726 *curpos += UTF8SKIP(*curpos);
4727 if (*curpos >= strend) {
4730 wb = getWB_VAL_UTF8(*curpos, strend);
4731 } while (wb == WB_Extend || wb == WB_Format);
4736 if (*curpos >= strend) {
4739 wb = getWB_VAL_CP(**curpos);
4740 } while (wb == WB_Extend || wb == WB_Format);
4747 S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
4751 PERL_ARGS_ASSERT_BACKUP_ONE_WB;
4753 /* If we know what the previous character's break value is, don't have
4755 if (*previous != WB_UNKNOWN) {
4757 *previous = WB_UNKNOWN;
4758 /* XXX Note that doesn't change curpos, and maybe should */
4760 /* But we always back up over these two types */
4761 if (wb != WB_Extend && wb != WB_Format) {
4766 if (*curpos < strbeg) {
4771 U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
4772 if (! prev_char_pos) {
4776 /* Back up over Extend and Format. curpos is always just to the right
4777 * of the characater whose value we are getting */
4779 U8 * prev_prev_char_pos;
4780 if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos,
4784 wb = getWB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
4785 *curpos = prev_char_pos;
4786 prev_char_pos = prev_prev_char_pos;
4789 *curpos = (U8 *) strbeg;
4792 } while (wb == WB_Extend || wb == WB_Format);
4796 if (*curpos - 2 < strbeg) {
4797 *curpos = (U8 *) strbeg;
4801 wb = getWB_VAL_CP(*(*curpos - 1));
4802 } while (wb == WB_Extend || wb == WB_Format);
4808 /* returns -1 on failure, $+[0] on success */
4810 S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
4812 #if PERL_VERSION < 9 && !defined(PERL_CORE)
4816 const bool utf8_target = reginfo->is_utf8_target;
4817 const U32 uniflags = UTF8_ALLOW_DEFAULT;
4818 REGEXP *rex_sv = reginfo->prog;
4819 regexp *rex = ReANY(rex_sv);
4820 RXi_GET_DECL(rex,rexi);
4821 /* the current state. This is a cached copy of PL_regmatch_state */
4823 /* cache heavy used fields of st in registers */
4826 U32 n = 0; /* general value; init to avoid compiler warning */
4827 SSize_t ln = 0; /* len or last; init to avoid compiler warning */
4828 char *locinput = startpos;
4829 char *pushinput; /* where to continue after a PUSH */
4830 I32 nextchr; /* is always set to UCHARAT(locinput) */
4832 bool result = 0; /* return value of S_regmatch */
4833 int depth = 0; /* depth of backtrack stack */
4834 U32 nochange_depth = 0; /* depth of GOSUB recursion with nochange */
4835 const U32 max_nochange_depth =
4836 (3 * rex->nparens > MAX_RECURSE_EVAL_NOCHANGE_DEPTH) ?
4837 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH;
4838 regmatch_state *yes_state = NULL; /* state to pop to on success of
4840 /* mark_state piggy backs on the yes_state logic so that when we unwind
4841 the stack on success we can update the mark_state as we go */
4842 regmatch_state *mark_state = NULL; /* last mark state we have seen */
4843 regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */
4844 struct regmatch_state *cur_curlyx = NULL; /* most recent curlyx */
4846 bool no_final = 0; /* prevent failure from backtracking? */
4847 bool do_cutgroup = 0; /* no_final only until next branch/trie entry */
4848 char *startpoint = locinput;
4849 SV *popmark = NULL; /* are we looking for a mark? */
4850 SV *sv_commit = NULL; /* last mark name seen in failure */
4851 SV *sv_yes_mark = NULL; /* last mark name we have seen
4852 during a successful match */
4853 U32 lastopen = 0; /* last open we saw */
4854 bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0;
4855 SV* const oreplsv = GvSVn(PL_replgv);
4856 /* these three flags are set by various ops to signal information to
4857 * the very next op. They have a useful lifetime of exactly one loop
4858 * iteration, and are not preserved or restored by state pushes/pops
4860 bool sw = 0; /* the condition value in (?(cond)a|b) */
4861 bool minmod = 0; /* the next "{n,m}" is a "{n,m}?" */
4862 int logical = 0; /* the following EVAL is:
4866 or the following IFMATCH/UNLESSM is:
4867 false: plain (?=foo)
4868 true: used as a condition: (?(?=foo))
4870 PAD* last_pad = NULL;
4872 I32 gimme = G_SCALAR;
4873 CV *caller_cv = NULL; /* who called us */
4874 CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */
4875 CHECKPOINT runops_cp; /* savestack position before executing EVAL */
4876 U32 maxopenparen = 0; /* max '(' index seen so far */
4877 int to_complement; /* Invert the result? */
4878 _char_class_number classnum;
4879 bool is_utf8_pat = reginfo->is_utf8_pat;
4884 GET_RE_DEBUG_FLAGS_DECL;
4887 /* protect against undef(*^R) */
4888 SAVEFREESV(SvREFCNT_inc_simple_NN(oreplsv));
4890 /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */
4891 multicall_oldcatch = 0;
4892 multicall_cv = NULL;
4894 PERL_UNUSED_VAR(multicall_cop);
4895 PERL_UNUSED_VAR(newsp);
4898 PERL_ARGS_ASSERT_REGMATCH;
4900 DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({
4901 PerlIO_printf(Perl_debug_log,"regmatch start\n");
4904 st = PL_regmatch_state;
4906 /* Note that nextchr is a byte even in UTF */
4909 while (scan != NULL) {
4912 SV * const prop = sv_newmortal();
4913 regnode *rnext=regnext(scan);
4914 DUMP_EXEC_POS( locinput, scan, utf8_target );
4915 regprop(rex, prop, scan, reginfo, NULL);
4917 PerlIO_printf(Perl_debug_log,
4918 "%3"IVdf":%*s%s(%"IVdf")\n",
4919 (IV)(scan - rexi->program), depth*2, "",
4921 (PL_regkind[OP(scan)] == END || !rnext) ?
4922 0 : (IV)(rnext - rexi->program));
4925 next = scan + NEXT_OFF(scan);
4928 state_num = OP(scan);
4930 REH_CALL_EXEC_NODE_HOOK(rex, scan, reginfo, st);
4935 assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
4937 switch (state_num) {
4938 case SBOL: /* /^../ and /\A../ */
4939 if (locinput == reginfo->strbeg)
4943 case MBOL: /* /^../m */
4944 if (locinput == reginfo->strbeg ||
4945 (!NEXTCHR_IS_EOS && locinput[-1] == '\n'))
4952 if (locinput == reginfo->ganch)
4956 case KEEPS: /* \K */
4957 /* update the startpoint */
4958 st->u.keeper.val = rex->offs[0].start;
4959 rex->offs[0].start = locinput - reginfo->strbeg;
4960 PUSH_STATE_GOTO(KEEPS_next, next, locinput);
4962 NOT_REACHED; /* NOTREACHED */
4964 case KEEPS_next_fail:
4965 /* rollback the start point change */
4966 rex->offs[0].start = st->u.keeper.val;
4969 NOT_REACHED; /* NOTREACHED */
4971 case MEOL: /* /..$/m */
4972 if (!NEXTCHR_IS_EOS && nextchr != '\n')
4976 case SEOL: /* /..$/ */
4977 if (!NEXTCHR_IS_EOS && nextchr != '\n')
4979 if (reginfo->strend - locinput > 1)
4984 if (!NEXTCHR_IS_EOS)
4988 case SANY: /* /./s */
4991 goto increment_locinput;
4999 case REG_ANY: /* /./ */
5000 if ((NEXTCHR_IS_EOS) || nextchr == '\n')
5002 goto increment_locinput;
5006 #define ST st->u.trie
5007 case TRIEC: /* (ab|cd) with known charclass */
5008 /* In this case the charclass data is available inline so
5009 we can fail fast without a lot of extra overhead.
5011 if(!NEXTCHR_IS_EOS && !ANYOF_BITMAP_TEST(scan, nextchr)) {
5013 PerlIO_printf(Perl_debug_log,
5014 "%*s %sfailed to match trie start class...%s\n",
5015 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
5019 NOT_REACHED; /* NOTREACHED */
5022 case TRIE: /* (ab|cd) */
5023 /* the basic plan of execution of the trie is:
5024 * At the beginning, run though all the states, and
5025 * find the longest-matching word. Also remember the position
5026 * of the shortest matching word. For example, this pattern:
5029 * when matched against the string "abcde", will generate
5030 * accept states for all words except 3, with the longest
5031 * matching word being 4, and the shortest being 2 (with
5032 * the position being after char 1 of the string).
5034 * Then for each matching word, in word order (i.e. 1,2,4,5),
5035 * we run the remainder of the pattern; on each try setting
5036 * the current position to the character following the word,
5037 * returning to try the next word on failure.
5039 * We avoid having to build a list of words at runtime by
5040 * using a compile-time structure, wordinfo[].prev, which
5041 * gives, for each word, the previous accepting word (if any).
5042 * In the case above it would contain the mappings 1->2, 2->0,
5043 * 3->0, 4->5, 5->1. We can use this table to generate, from
5044 * the longest word (4 above), a list of all words, by
5045 * following the list of prev pointers; this gives us the
5046 * unordered list 4,5,1,2. Then given the current word we have
5047 * just tried, we can go through the list and find the
5048 * next-biggest word to try (so if we just failed on word 2,
5049 * the next in the list is 4).
5051 * Since at runtime we don't record the matching position in
5052 * the string for each word, we have to work that out for
5053 * each word we're about to process. The wordinfo table holds
5054 * the character length of each word; given that we recorded
5055 * at the start: the position of the shortest word and its
5056 * length in chars, we just need to move the pointer the
5057 * difference between the two char lengths. Depending on
5058 * Unicode status and folding, that's cheap or expensive.
5060 * This algorithm is optimised for the case where are only a
5061 * small number of accept states, i.e. 0,1, or maybe 2.
5062 * With lots of accepts states, and having to try all of them,
5063 * it becomes quadratic on number of accept states to find all
5068 /* what type of TRIE am I? (utf8 makes this contextual) */
5069 DECL_TRIE_TYPE(scan);
5071 /* what trie are we using right now */
5072 reg_trie_data * const trie
5073 = (reg_trie_data*)rexi->data->data[ ARG( scan ) ];
5074 HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
5075 U32 state = trie->startstate;
5077 if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
5078 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5080 && UTF8_IS_ABOVE_LATIN1(nextchr)
5081 && scan->flags == EXACTL)
5083 /* We only output for EXACTL, as we let the folder
5084 * output this message for EXACTFLU8 to avoid
5086 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
5091 && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
5093 if (trie->states[ state ].wordnum) {
5095 PerlIO_printf(Perl_debug_log,
5096 "%*s %smatched empty string...%s\n",
5097 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
5103 PerlIO_printf(Perl_debug_log,
5104 "%*s %sfailed to match trie start class...%s\n",
5105 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5])
5112 U8 *uc = ( U8* )locinput;
5116 U8 *uscan = (U8*)NULL;
5117 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
5118 U32 charcount = 0; /* how many input chars we have matched */
5119 U32 accepted = 0; /* have we seen any accepting states? */
5121 ST.jump = trie->jump;
5124 ST.longfold = FALSE; /* char longer if folded => it's harder */
5127 /* fully traverse the TRIE; note the position of the
5128 shortest accept state and the wordnum of the longest
5131 while ( state && uc <= (U8*)(reginfo->strend) ) {
5132 U32 base = trie->states[ state ].trans.base;
5136 wordnum = trie->states[ state ].wordnum;
5138 if (wordnum) { /* it's an accept state */
5141 /* record first match position */
5143 ST.firstpos = (U8*)locinput;
5148 ST.firstchars = charcount;
5151 if (!ST.nextword || wordnum < ST.nextword)
5152 ST.nextword = wordnum;
5153 ST.topword = wordnum;
5156 DEBUG_TRIE_EXECUTE_r({
5157 DUMP_EXEC_POS( (char *)uc, scan, utf8_target );
5158 PerlIO_printf( Perl_debug_log,
5159 "%*s %sState: %4"UVxf" Accepted: %c ",
5160 2+depth * 2, "", PL_colors[4],
5161 (UV)state, (accepted ? 'Y' : 'N'));
5164 /* read a char and goto next state */
5165 if ( base && (foldlen || uc < (U8*)(reginfo->strend))) {
5167 REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc,
5168 uscan, len, uvc, charid, foldlen,
5175 base + charid - 1 - trie->uniquecharcount)) >= 0)
5177 && ((U32)offset < trie->lasttrans)
5178 && trie->trans[offset].check == state)
5180 state = trie->trans[offset].next;
5191 DEBUG_TRIE_EXECUTE_r(
5192 PerlIO_printf( Perl_debug_log,
5193 "Charid:%3x CP:%4"UVxf" After State: %4"UVxf"%s\n",
5194 charid, uvc, (UV)state, PL_colors[5] );
5200 /* calculate total number of accept states */
5205 w = trie->wordinfo[w].prev;
5208 ST.accepted = accepted;
5212 PerlIO_printf( Perl_debug_log,
5213 "%*s %sgot %"IVdf" possible matches%s\n",
5214 REPORT_CODE_OFF + depth * 2, "",
5215 PL_colors[4], (IV)ST.accepted, PL_colors[5] );
5217 goto trie_first_try; /* jump into the fail handler */
5220 NOT_REACHED; /* NOTREACHED */
5222 case TRIE_next_fail: /* we failed - try next alternative */
5226 REGCP_UNWIND(ST.cp);
5227 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
5229 if (!--ST.accepted) {
5231 PerlIO_printf( Perl_debug_log,
5232 "%*s %sTRIE failed...%s\n",
5233 REPORT_CODE_OFF+depth*2, "",
5240 /* Find next-highest word to process. Note that this code
5241 * is O(N^2) per trie run (O(N) per branch), so keep tight */
5244 U16 const nextword = ST.nextword;
5245 reg_trie_wordinfo * const wordinfo
5246 = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo;
5247 for (word=ST.topword; word; word=wordinfo[word].prev) {
5248 if (word > nextword && (!min || word < min))
5261 ST.lastparen = rex->lastparen;
5262 ST.lastcloseparen = rex->lastcloseparen;
5266 /* find start char of end of current word */
5268 U32 chars; /* how many chars to skip */
5269 reg_trie_data * const trie
5270 = (reg_trie_data*)rexi->data->data[ARG(ST.me)];
5272 assert((trie->wordinfo[ST.nextword].len - trie->prefixlen)
5274 chars = (trie->wordinfo[ST.nextword].len - trie->prefixlen)
5279 /* the hard option - fold each char in turn and find
5280 * its folded length (which may be different */
5281 U8 foldbuf[UTF8_MAXBYTES_CASE + 1];
5289 uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len,
5297 uvc = to_uni_fold(uvc, foldbuf, &foldlen);
5302 uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len,
5318 scan = ST.me + ((ST.jump && ST.jump[ST.nextword])
5319 ? ST.jump[ST.nextword]
5323 PerlIO_printf( Perl_debug_log,
5324 "%*s %sTRIE matched word #%d, continuing%s\n",
5325 REPORT_CODE_OFF+depth*2, "",
5332 if (ST.accepted > 1 || has_cutgroup) {
5333 PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
5335 NOT_REACHED; /* NOTREACHED */
5337 /* only one choice left - just continue */
5339 AV *const trie_words
5340 = MUTABLE_AV(rexi->data->data[ARG(ST.me)+TRIE_WORDS_OFFSET]);
5341 SV ** const tmp = trie_words
5342 ? av_fetch(trie_words, ST.nextword - 1, 0) : NULL;
5343 SV *sv= tmp ? sv_newmortal() : NULL;
5345 PerlIO_printf( Perl_debug_log,
5346 "%*s %sonly one match left, short-circuiting: #%d <%s>%s\n",
5347 REPORT_CODE_OFF+depth*2, "", PL_colors[4],
5349 tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0,
5350 PL_colors[0], PL_colors[1],
5351 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII
5353 : "not compiled under -Dr",
5357 locinput = (char*)uc;
5358 continue; /* execute rest of RE */
5363 case EXACTL: /* /abc/l */
5364 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5366 /* Complete checking would involve going through every character
5367 * matched by the string to see if any is above latin1. But the
5368 * comparision otherwise might very well be a fast assembly
5369 * language routine, and I (khw) don't think slowing things down
5370 * just to check for this warning is worth it. So this just checks
5371 * the first character */
5372 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
5373 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5376 case EXACT: { /* /abc/ */
5377 char *s = STRING(scan);
5379 if (utf8_target != is_utf8_pat) {
5380 /* The target and the pattern have differing utf8ness. */
5382 const char * const e = s + ln;
5385 /* The target is utf8, the pattern is not utf8.
5386 * Above-Latin1 code points can't match the pattern;
5387 * invariants match exactly, and the other Latin1 ones need
5388 * to be downgraded to a single byte in order to do the
5389 * comparison. (If we could be confident that the target
5390 * is not malformed, this could be refactored to have fewer
5391 * tests by just assuming that if the first bytes match, it
5392 * is an invariant, but there are tests in the test suite
5393 * dealing with (??{...}) which violate this) */
5395 if (l >= reginfo->strend
5396 || UTF8_IS_ABOVE_LATIN1(* (U8*) l))
5400 if (UTF8_IS_INVARIANT(*(U8*)l)) {
5407 if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
5417 /* The target is not utf8, the pattern is utf8. */
5419 if (l >= reginfo->strend
5420 || UTF8_IS_ABOVE_LATIN1(* (U8*) s))
5424 if (UTF8_IS_INVARIANT(*(U8*)s)) {
5431 if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
5443 /* The target and the pattern have the same utf8ness. */
5444 /* Inline the first character, for speed. */
5445 if (reginfo->strend - locinput < ln
5446 || UCHARAT(s) != nextchr
5447 || (ln > 1 && memNE(s, locinput, ln)))
5456 case EXACTFL: { /* /abc/il */
5458 const U8 * fold_array;
5460 U32 fold_utf8_flags;
5462 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5463 folder = foldEQ_locale;
5464 fold_array = PL_fold_locale;
5465 fold_utf8_flags = FOLDEQ_LOCALE;
5468 case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
5469 is effectively /u; hence to match, target
5471 if (! utf8_target) {
5474 fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
5475 | FOLDEQ_S1_FOLDS_SANE;
5476 folder = foldEQ_latin1;
5477 fold_array = PL_fold_latin1;
5480 case EXACTFU_SS: /* /\x{df}/iu */
5481 case EXACTFU: /* /abc/iu */
5482 folder = foldEQ_latin1;
5483 fold_array = PL_fold_latin1;
5484 fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0;
5487 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8
5489 assert(! is_utf8_pat);
5491 case EXACTFA: /* /abc/iaa */
5492 folder = foldEQ_latin1;
5493 fold_array = PL_fold_latin1;
5494 fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
5497 case EXACTF: /* /abc/i This node only generated for
5498 non-utf8 patterns */
5499 assert(! is_utf8_pat);
5501 fold_array = PL_fold;
5502 fold_utf8_flags = 0;
5510 || state_num == EXACTFU_SS
5511 || (state_num == EXACTFL && IN_UTF8_CTYPE_LOCALE))
5513 /* Either target or the pattern are utf8, or has the issue where
5514 * the fold lengths may differ. */
5515 const char * const l = locinput;
5516 char *e = reginfo->strend;
5518 if (! foldEQ_utf8_flags(s, 0, ln, is_utf8_pat,
5519 l, &e, 0, utf8_target, fold_utf8_flags))
5527 /* Neither the target nor the pattern are utf8 */
5528 if (UCHARAT(s) != nextchr
5530 && UCHARAT(s) != fold_array[nextchr])
5534 if (reginfo->strend - locinput < ln)
5536 if (ln > 1 && ! folder(s, locinput, ln))
5542 case NBOUNDL: /* /\B/l */
5546 case BOUNDL: /* /\b/l */
5547 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5549 if (FLAGS(scan) != TRADITIONAL_BOUND) {
5550 if (! IN_UTF8_CTYPE_LOCALE) {
5551 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
5552 B_ON_NON_UTF8_LOCALE_IS_WRONG);
5558 if (locinput == reginfo->strbeg)
5559 ln = isWORDCHAR_LC('\n');
5561 ln = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
5562 (U8*)(reginfo->strbeg)));
5564 n = (NEXTCHR_IS_EOS)
5565 ? isWORDCHAR_LC('\n')
5566 : isWORDCHAR_LC_utf8((U8*)locinput);
5568 else { /* Here the string isn't utf8 */
5569 ln = (locinput == reginfo->strbeg)
5570 ? isWORDCHAR_LC('\n')
5571 : isWORDCHAR_LC(UCHARAT(locinput - 1));
5572 n = (NEXTCHR_IS_EOS)
5573 ? isWORDCHAR_LC('\n')
5574 : isWORDCHAR_LC(nextchr);
5576 if (to_complement ^ (ln == n)) {
5581 case NBOUND: /* /\B/ */
5585 case BOUND: /* /\b/ */
5589 goto bound_ascii_match_only;
5591 case NBOUNDA: /* /\B/a */
5595 case BOUNDA: /* /\b/a */
5597 bound_ascii_match_only:
5598 /* Here the string isn't utf8, or is utf8 and only ascii characters
5599 * are to match \w. In the latter case looking at the byte just
5600 * prior to the current one may be just the final byte of a
5601 * multi-byte character. This is ok. There are two cases:
5602 * 1) it is a single byte character, and then the test is doing
5603 * just what it's supposed to.
5604 * 2) it is a multi-byte character, in which case the final byte is
5605 * never mistakable for ASCII, and so the test will say it is
5606 * not a word character, which is the correct answer. */
5607 ln = (locinput == reginfo->strbeg)
5608 ? isWORDCHAR_A('\n')
5609 : isWORDCHAR_A(UCHARAT(locinput - 1));
5610 n = (NEXTCHR_IS_EOS)
5611 ? isWORDCHAR_A('\n')
5612 : isWORDCHAR_A(nextchr);
5613 if (to_complement ^ (ln == n)) {
5618 case NBOUNDU: /* /\B/u */
5622 case BOUNDU: /* /\b/u */
5628 switch((bound_type) FLAGS(scan)) {
5629 case TRADITIONAL_BOUND:
5630 ln = (locinput == reginfo->strbeg)
5631 ? 0 /* isWORDCHAR_L1('\n') */
5632 : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
5633 (U8*)(reginfo->strbeg)));
5634 n = (NEXTCHR_IS_EOS)
5635 ? 0 /* isWORDCHAR_L1('\n') */
5636 : isWORDCHAR_utf8((U8*)locinput);
5637 match = cBOOL(ln != n);
5640 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5641 match = TRUE; /* GCB always matches at begin and
5645 /* Find the gcb values of previous and current
5646 * chars, then see if is a break point */
5647 match = isGCB(getGCB_VAL_UTF8(
5648 reghop3((U8*)locinput,
5650 (U8*)(reginfo->strbeg)),
5651 (U8*) reginfo->strend),
5652 getGCB_VAL_UTF8((U8*) locinput,
5653 (U8*) reginfo->strend));
5657 case SB_BOUND: /* Always matches at begin and end */
5658 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5662 match = isSB(getSB_VAL_UTF8(
5663 reghop3((U8*)locinput,
5665 (U8*)(reginfo->strbeg)),
5666 (U8*) reginfo->strend),
5667 getSB_VAL_UTF8((U8*) locinput,
5668 (U8*) reginfo->strend),
5669 (U8*) reginfo->strbeg,
5671 (U8*) reginfo->strend,
5677 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5681 match = isWB(WB_UNKNOWN,
5683 reghop3((U8*)locinput,
5685 (U8*)(reginfo->strbeg)),
5686 (U8*) reginfo->strend),
5687 getWB_VAL_UTF8((U8*) locinput,
5688 (U8*) reginfo->strend),
5689 (U8*) reginfo->strbeg,
5691 (U8*) reginfo->strend,
5697 else { /* Not utf8 target */
5698 switch((bound_type) FLAGS(scan)) {
5699 case TRADITIONAL_BOUND:
5700 ln = (locinput == reginfo->strbeg)
5701 ? 0 /* isWORDCHAR_L1('\n') */
5702 : isWORDCHAR_L1(UCHARAT(locinput - 1));
5703 n = (NEXTCHR_IS_EOS)
5704 ? 0 /* isWORDCHAR_L1('\n') */
5705 : isWORDCHAR_L1(nextchr);
5706 match = cBOOL(ln != n);
5710 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5711 match = TRUE; /* GCB always matches at begin and
5714 else { /* Only CR-LF combo isn't a GCB in 0-255
5716 match = UCHARAT(locinput - 1) != '\r'
5717 || UCHARAT(locinput) != '\n';
5721 case SB_BOUND: /* Always matches at begin and end */
5722 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5726 match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
5727 getSB_VAL_CP(UCHARAT(locinput)),
5728 (U8*) reginfo->strbeg,
5730 (U8*) reginfo->strend,
5736 if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
5740 match = isWB(WB_UNKNOWN,
5741 getWB_VAL_CP(UCHARAT(locinput -1)),
5742 getWB_VAL_CP(UCHARAT(locinput)),
5743 (U8*) reginfo->strbeg,
5745 (U8*) reginfo->strend,
5752 if (to_complement ^ ! match) {
5757 case ANYOFL: /* /[abc]/l */
5758 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5760 case ANYOF: /* /[abc]/ */
5764 if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
5767 locinput += UTF8SKIP(locinput);
5770 if (!REGINCLASS(rex, scan, (U8*)locinput))
5776 /* The argument (FLAGS) to all the POSIX node types is the class number
5779 case NPOSIXL: /* \W or [:^punct:] etc. under /l */
5783 case POSIXL: /* \w or [:punct:] etc. under /l */
5784 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
5788 /* Use isFOO_lc() for characters within Latin1. (Note that
5789 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
5790 * wouldn't be invariant) */
5791 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
5792 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) {
5796 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
5797 if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
5798 (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
5799 *(locinput + 1))))))
5804 else { /* Here, must be an above Latin-1 code point */
5805 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
5806 goto utf8_posix_above_latin1;
5809 /* Here, must be utf8 */
5810 locinput += UTF8SKIP(locinput);
5813 case NPOSIXD: /* \W or [:^punct:] etc. under /d */
5817 case POSIXD: /* \w or [:punct:] etc. under /d */
5823 case NPOSIXA: /* \W or [:^punct:] etc. under /a */
5825 if (NEXTCHR_IS_EOS) {
5829 /* All UTF-8 variants match */
5830 if (! UTF8_IS_INVARIANT(nextchr)) {
5831 goto increment_locinput;
5837 case POSIXA: /* \w or [:punct:] etc. under /a */
5840 /* We get here through POSIXD, NPOSIXD, and NPOSIXA when not in
5841 * UTF-8, and also from NPOSIXA even in UTF-8 when the current
5842 * character is a single byte */
5845 || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr,
5851 /* Here we are either not in utf8, or we matched a utf8-invariant,
5852 * so the next char is the next byte */
5856 case NPOSIXU: /* \W or [:^punct:] etc. under /u */
5860 case POSIXU: /* \w or [:punct:] etc. under /u */
5862 if (NEXTCHR_IS_EOS) {
5866 /* Use _generic_isCC() for characters within Latin1. (Note that
5867 * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
5868 * wouldn't be invariant) */
5869 if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) {
5870 if (! (to_complement ^ cBOOL(_generic_isCC(nextchr,
5877 else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
5878 if (! (to_complement
5879 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
5887 else { /* Handle above Latin-1 code points */
5888 utf8_posix_above_latin1:
5889 classnum = (_char_class_number) FLAGS(scan);
5890 if (classnum < _FIRST_NON_SWASH_CC) {
5892 /* Here, uses a swash to find such code points. Load if if
5893 * not done already */
5894 if (! PL_utf8_swash_ptrs[classnum]) {
5895 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
5896 PL_utf8_swash_ptrs[classnum]
5897 = _core_swash_init("utf8",
5900 PL_XPosix_ptrs[classnum], &flags);
5902 if (! (to_complement
5903 ^ cBOOL(swash_fetch(PL_utf8_swash_ptrs[classnum],
5904 (U8 *) locinput, TRUE))))
5909 else { /* Here, uses macros to find above Latin-1 code points */
5911 case _CC_ENUM_SPACE:
5912 if (! (to_complement
5913 ^ cBOOL(is_XPERLSPACE_high(locinput))))
5918 case _CC_ENUM_BLANK:
5919 if (! (to_complement
5920 ^ cBOOL(is_HORIZWS_high(locinput))))
5925 case _CC_ENUM_XDIGIT:
5926 if (! (to_complement
5927 ^ cBOOL(is_XDIGIT_high(locinput))))
5932 case _CC_ENUM_VERTSPACE:
5933 if (! (to_complement
5934 ^ cBOOL(is_VERTWS_high(locinput))))
5939 default: /* The rest, e.g. [:cntrl:], can't match
5941 if (! to_complement) {
5947 locinput += UTF8SKIP(locinput);
5951 case CLUMP: /* Match \X: logical Unicode character. This is defined as
5952 a Unicode extended Grapheme Cluster */
5955 if (! utf8_target) {
5957 /* Match either CR LF or '.', as all the other possibilities
5959 locinput++; /* Match the . or CR */
5960 if (nextchr == '\r' /* And if it was CR, and the next is LF,
5962 && locinput < reginfo->strend
5963 && UCHARAT(locinput) == '\n')
5970 /* Get the gcb type for the current character */
5971 GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
5972 (U8*) reginfo->strend);
5974 /* Then scan through the input until we get to the first
5975 * character whose type is supposed to be a gcb with the
5976 * current character. (There is always a break at the
5978 locinput += UTF8SKIP(locinput);
5979 while (locinput < reginfo->strend) {
5980 GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
5981 (U8*) reginfo->strend);
5982 if (isGCB(prev_gcb, cur_gcb)) {
5987 locinput += UTF8SKIP(locinput);
5994 case NREFFL: /* /\g{name}/il */
5995 { /* The capture buffer cases. The ones beginning with N for the
5996 named buffers just convert to the equivalent numbered and
5997 pretend they were called as the corresponding numbered buffer
5999 /* don't initialize these in the declaration, it makes C++
6004 const U8 *fold_array;
6007 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6008 folder = foldEQ_locale;
6009 fold_array = PL_fold_locale;
6011 utf8_fold_flags = FOLDEQ_LOCALE;
6014 case NREFFA: /* /\g{name}/iaa */
6015 folder = foldEQ_latin1;
6016 fold_array = PL_fold_latin1;
6018 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6021 case NREFFU: /* /\g{name}/iu */
6022 folder = foldEQ_latin1;
6023 fold_array = PL_fold_latin1;
6025 utf8_fold_flags = 0;
6028 case NREFF: /* /\g{name}/i */
6030 fold_array = PL_fold;
6032 utf8_fold_flags = 0;
6035 case NREF: /* /\g{name}/ */
6039 utf8_fold_flags = 0;
6042 /* For the named back references, find the corresponding buffer
6044 n = reg_check_named_buff_matched(rex,scan);
6049 goto do_nref_ref_common;
6051 case REFFL: /* /\1/il */
6052 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
6053 folder = foldEQ_locale;
6054 fold_array = PL_fold_locale;
6055 utf8_fold_flags = FOLDEQ_LOCALE;
6058 case REFFA: /* /\1/iaa */
6059 folder = foldEQ_latin1;
6060 fold_array = PL_fold_latin1;
6061 utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII;
6064 case REFFU: /* /\1/iu */
6065 folder = foldEQ_latin1;
6066 fold_array = PL_fold_latin1;
6067 utf8_fold_flags = 0;
6070 case REFF: /* /\1/i */
6072 fold_array = PL_fold;
6073 utf8_fold_flags = 0;
6076 case REF: /* /\1/ */
6079 utf8_fold_flags = 0;
6083 n = ARG(scan); /* which paren pair */
6086 ln = rex->offs[n].start;
6087 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6088 if (rex->lastparen < n || ln == -1)
6089 sayNO; /* Do not match unless seen CLOSEn. */
6090 if (ln == rex->offs[n].end)
6093 s = reginfo->strbeg + ln;
6094 if (type != REF /* REF can do byte comparison */
6095 && (utf8_target || type == REFFU || type == REFFL))
6097 char * limit = reginfo->strend;
6099 /* This call case insensitively compares the entire buffer
6100 * at s, with the current input starting at locinput, but
6101 * not going off the end given by reginfo->strend, and
6102 * returns in <limit> upon success, how much of the
6103 * current input was matched */
6104 if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target,
6105 locinput, &limit, 0, utf8_target, utf8_fold_flags))
6113 /* Not utf8: Inline the first character, for speed. */
6114 if (!NEXTCHR_IS_EOS &&
6115 UCHARAT(s) != nextchr &&
6117 UCHARAT(s) != fold_array[nextchr]))
6119 ln = rex->offs[n].end - ln;
6120 if (locinput + ln > reginfo->strend)
6122 if (ln > 1 && (type == REF
6123 ? memNE(s, locinput, ln)
6124 : ! folder(s, locinput, ln)))
6130 case NOTHING: /* null op; e.g. the 'nothing' following
6131 * the '*' in m{(a+|b)*}' */
6133 case TAIL: /* placeholder while compiling (A|B|C) */
6137 #define ST st->u.eval
6142 regexp_internal *rei;
6143 regnode *startpoint;
6145 case GOSTART: /* (?R) */
6146 case GOSUB: /* /(...(?1))/ /(...(?&foo))/ */
6147 if (cur_eval && cur_eval->locinput==locinput) {
6148 if (cur_eval->u.eval.close_paren == (U32)ARG(scan))
6149 Perl_croak(aTHX_ "Infinite recursion in regex");
6150 if ( ++nochange_depth > max_nochange_depth )
6152 "Pattern subroutine nesting without pos change"
6153 " exceeded limit in regex");
6160 if (OP(scan)==GOSUB) {
6161 startpoint = scan + ARG2L(scan);
6162 ST.close_paren = ARG(scan);
6164 startpoint = rei->program+1;
6168 /* Save all the positions seen so far. */
6169 ST.cp = regcppush(rex, 0, maxopenparen);
6170 REGCP_SET(ST.lastcp);
6172 /* and then jump to the code we share with EVAL */
6173 goto eval_recurse_doit;
6176 case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
6177 if (cur_eval && cur_eval->locinput==locinput) {
6178 if ( ++nochange_depth > max_nochange_depth )
6179 Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex");
6184 /* execute the code in the {...} */
6188 OP * const oop = PL_op;
6189 COP * const ocurcop = PL_curcop;
6193 /* save *all* paren positions */
6194 regcppush(rex, 0, maxopenparen);
6195 REGCP_SET(runops_cp);
6198 caller_cv = find_runcv(NULL);
6202 if (rexi->data->what[n] == 'r') { /* code from an external qr */
6204 (REGEXP*)(rexi->data->data[n])
6207 nop = (OP*)rexi->data->data[n+1];
6209 else if (rexi->data->what[n] == 'l') { /* literal code */
6211 nop = (OP*)rexi->data->data[n];
6212 assert(CvDEPTH(newcv));
6215 /* literal with own CV */
6216 assert(rexi->data->what[n] == 'L');
6217 newcv = rex->qr_anoncv;
6218 nop = (OP*)rexi->data->data[n];
6221 /* normally if we're about to execute code from the same
6222 * CV that we used previously, we just use the existing
6223 * CX stack entry. However, its possible that in the
6224 * meantime we may have backtracked, popped from the save
6225 * stack, and undone the SAVECOMPPAD(s) associated with
6226 * PUSH_MULTICALL; in which case PL_comppad no longer
6227 * points to newcv's pad. */
6228 if (newcv != last_pushed_cv || PL_comppad != last_pad)
6230 U8 flags = (CXp_SUB_RE |
6231 ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
6232 if (last_pushed_cv) {
6233 CHANGE_MULTICALL_FLAGS(newcv, flags);
6236 PUSH_MULTICALL_FLAGS(newcv, flags);
6238 last_pushed_cv = newcv;
6241 /* these assignments are just to silence compiler
6243 multicall_cop = NULL;
6246 last_pad = PL_comppad;
6248 /* the initial nextstate you would normally execute
6249 * at the start of an eval (which would cause error
6250 * messages to come from the eval), may be optimised
6251 * away from the execution path in the regex code blocks;
6252 * so manually set PL_curcop to it initially */
6254 OP *o = cUNOPx(nop)->op_first;
6255 assert(o->op_type == OP_NULL);
6256 if (o->op_targ == OP_SCOPE) {
6257 o = cUNOPo->op_first;
6260 assert(o->op_targ == OP_LEAVE);
6261 o = cUNOPo->op_first;
6262 assert(o->op_type == OP_ENTER);
6266 if (o->op_type != OP_STUB) {
6267 assert( o->op_type == OP_NEXTSTATE
6268 || o->op_type == OP_DBSTATE
6269 || (o->op_type == OP_NULL
6270 && ( o->op_targ == OP_NEXTSTATE
6271 || o->op_targ == OP_DBSTATE
6275 PL_curcop = (COP*)o;
6280 DEBUG_STATE_r( PerlIO_printf(Perl_debug_log,
6281 " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) );
6283 rex->offs[0].end = locinput - reginfo->strbeg;
6284 if (reginfo->info_aux_eval->pos_magic)
6285 MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic,
6286 reginfo->sv, reginfo->strbeg,
6287 locinput - reginfo->strbeg);
6290 SV *sv_mrk = get_sv("REGMARK", 1);
6291 sv_setsv(sv_mrk, sv_yes_mark);
6294 /* we don't use MULTICALL here as we want to call the
6295 * first op of the block of interest, rather than the
6296 * first op of the sub */
6297 before = (IV)(SP-PL_stack_base);
6299 CALLRUNOPS(aTHX); /* Scalar context. */
6301 if ((IV)(SP-PL_stack_base) == before)
6302 ret = &PL_sv_undef; /* protect against empty (?{}) blocks. */
6308 /* before restoring everything, evaluate the returned
6309 * value, so that 'uninit' warnings don't use the wrong
6310 * PL_op or pad. Also need to process any magic vars
6311 * (e.g. $1) *before* parentheses are restored */
6316 if (logical == 0) /* (?{})/ */
6317 sv_setsv(save_scalar(PL_replgv), ret); /* $^R */
6318 else if (logical == 1) { /* /(?(?{...})X|Y)/ */
6319 sw = cBOOL(SvTRUE(ret));
6322 else { /* /(??{}) */
6323 /* if its overloaded, let the regex compiler handle
6324 * it; otherwise extract regex, or stringify */
6325 if (SvGMAGICAL(ret))
6326 ret = sv_mortalcopy(ret);
6327 if (!SvAMAGIC(ret)) {
6331 if (SvTYPE(sv) == SVt_REGEXP)
6332 re_sv = (REGEXP*) sv;
6333 else if (SvSMAGICAL(ret)) {
6334 MAGIC *mg = mg_find(ret, PERL_MAGIC_qr);
6336 re_sv = (REGEXP *) mg->mg_obj;
6339 /* force any undef warnings here */
6340 if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) {
6341 ret = sv_mortalcopy(ret);
6342 (void) SvPV_force_nolen(ret);
6348 /* *** Note that at this point we don't restore
6349 * PL_comppad, (or pop the CxSUB) on the assumption it may
6350 * be used again soon. This is safe as long as nothing
6351 * in the regexp code uses the pad ! */
6353 PL_curcop = ocurcop;
6354 S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen);
6355 PL_curpm = PL_reg_curpm;
6361 /* only /(??{})/ from now on */
6364 /* extract RE object from returned value; compiling if
6368 re_sv = reg_temp_copy(NULL, re_sv);
6373 if (SvUTF8(ret) && IN_BYTES) {
6374 /* In use 'bytes': make a copy of the octet
6375 * sequence, but without the flag on */
6377 const char *const p = SvPV(ret, len);
6378 ret = newSVpvn_flags(p, len, SVs_TEMP);
6380 if (rex->intflags & PREGf_USE_RE_EVAL)
6381 pm_flags |= PMf_USE_RE_EVAL;
6383 /* if we got here, it should be an engine which
6384 * supports compiling code blocks and stuff */
6385 assert(rex->engine && rex->engine->op_comp);
6386 assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
6387 re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
6388 rex->engine, NULL, NULL,
6389 /* copy /msixn etc to inner pattern */
6394 & (SVs_TEMP | SVs_GMG | SVf_ROK))
6395 && (!SvPADTMP(ret) || SvREADONLY(ret))) {
6396 /* This isn't a first class regexp. Instead, it's
6397 caching a regexp onto an existing, Perl visible
6399 sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0);
6405 RXp_MATCH_COPIED_off(re);
6406 re->subbeg = rex->subbeg;
6407 re->sublen = rex->sublen;
6408 re->suboffset = rex->suboffset;
6409 re->subcoffset = rex->subcoffset;
6411 re->lastcloseparen = 0;
6414 debug_start_match(re_sv, utf8_target, locinput,
6415 reginfo->strend, "Matching embedded");
6417 startpoint = rei->program + 1;
6418 ST.close_paren = 0; /* only used for GOSUB */
6419 /* Save all the seen positions so far. */
6420 ST.cp = regcppush(rex, 0, maxopenparen);
6421 REGCP_SET(ST.lastcp);
6422 /* and set maxopenparen to 0, since we are starting a "fresh" match */
6424 /* run the pattern returned from (??{...}) */
6426 eval_recurse_doit: /* Share code with GOSUB below this line
6427 * At this point we expect the stack context to be
6428 * set up correctly */
6430 /* invalidate the S-L poscache. We're now executing a
6431 * different set of WHILEM ops (and their associated
6432 * indexes) against the same string, so the bits in the
6433 * cache are meaningless. Setting maxiter to zero forces
6434 * the cache to be invalidated and zeroed before reuse.
6435 * XXX This is too dramatic a measure. Ideally we should
6436 * save the old cache and restore when running the outer
6438 reginfo->poscache_maxiter = 0;
6440 /* the new regexp might have a different is_utf8_pat than we do */
6441 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(re_sv));
6443 ST.prev_rex = rex_sv;
6444 ST.prev_curlyx = cur_curlyx;
6446 SET_reg_curpm(rex_sv);
6451 ST.prev_eval = cur_eval;
6453 /* now continue from first node in postoned RE */
6454 PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
6456 NOT_REACHED; /* NOTREACHED */
6459 case EVAL_AB: /* cleanup after a successful (??{A})B */
6460 /* note: this is called twice; first after popping B, then A */
6461 rex_sv = ST.prev_rex;
6462 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6463 SET_reg_curpm(rex_sv);
6464 rex = ReANY(rex_sv);
6465 rexi = RXi_GET(rex);
6467 /* preserve $^R across LEAVE's. See Bug 121070. */
6468 SV *save_sv= GvSV(PL_replgv);
6469 SvREFCNT_inc(save_sv);
6470 regcpblow(ST.cp); /* LEAVE in disguise */
6471 sv_setsv(GvSV(PL_replgv), save_sv);
6472 SvREFCNT_dec(save_sv);
6474 cur_eval = ST.prev_eval;
6475 cur_curlyx = ST.prev_curlyx;
6477 /* Invalidate cache. See "invalidate" comment above. */
6478 reginfo->poscache_maxiter = 0;
6479 if ( nochange_depth )
6484 case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */
6485 /* note: this is called twice; first after popping B, then A */
6486 rex_sv = ST.prev_rex;
6487 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
6488 SET_reg_curpm(rex_sv);
6489 rex = ReANY(rex_sv);
6490 rexi = RXi_GET(rex);
6492 REGCP_UNWIND(ST.lastcp);
6493 regcppop(rex, &maxopenparen);
6494 cur_eval = ST.prev_eval;
6495 cur_curlyx = ST.prev_curlyx;
6496 /* Invalidate cache. See "invalidate" comment above. */
6497 reginfo->poscache_maxiter = 0;
6498 if ( nochange_depth )
6504 n = ARG(scan); /* which paren pair */
6505 rex->offs[n].start_tmp = locinput - reginfo->strbeg;
6506 if (n > maxopenparen)
6508 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log,
6509 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; maxopenparen=%"UVuf"\n",
6513 (IV)rex->offs[n].start_tmp,
6519 /* XXX really need to log other places start/end are set too */
6520 #define CLOSE_CAPTURE \
6521 rex->offs[n].start = rex->offs[n].start_tmp; \
6522 rex->offs[n].end = locinput - reginfo->strbeg; \
6523 DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \
6524 "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \
6526 PTR2UV(rex->offs), \
6528 (IV)rex->offs[n].start, \
6529 (IV)rex->offs[n].end \
6533 n = ARG(scan); /* which paren pair */
6535 if (n > rex->lastparen)
6537 rex->lastcloseparen = n;
6538 if (cur_eval && cur_eval->u.eval.close_paren == n) {
6543 case ACCEPT: /* (*ACCEPT) */
6547 cursor && OP(cursor)!=END;
6548 cursor=regnext(cursor))
6550 if ( OP(cursor)==CLOSE ){
6552 if ( n <= lastopen ) {
6554 if (n > rex->lastparen)
6556 rex->lastcloseparen = n;
6557 if ( n == ARG(scan) || (cur_eval &&
6558 cur_eval->u.eval.close_paren == n))
6567 case GROUPP: /* (?(1)) */
6568 n = ARG(scan); /* which paren pair */
6569 sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1);
6572 case NGROUPP: /* (?(<name>)) */
6573 /* reg_check_named_buff_matched returns 0 for no match */
6574 sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan));
6577 case INSUBP: /* (?(R)) */
6579 sw = (cur_eval && (!n || cur_eval->u.eval.close_paren == n));
6582 case DEFINEP: /* (?(DEFINE)) */
6586 case IFTHEN: /* (?(cond)A|B) */
6587 reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */
6589 next = NEXTOPER(NEXTOPER(scan));
6591 next = scan + ARG(scan);
6592 if (OP(next) == IFTHEN) /* Fake one. */
6593 next = NEXTOPER(NEXTOPER(next));
6597 case LOGICAL: /* modifier for EVAL and IFMATCH */
6598 logical = scan->flags;
6601 /*******************************************************************
6603 The CURLYX/WHILEM pair of ops handle the most generic case of the /A*B/
6604 pattern, where A and B are subpatterns. (For simple A, CURLYM or
6605 STAR/PLUS/CURLY/CURLYN are used instead.)
6607 A*B is compiled as <CURLYX><A><WHILEM><B>
6609 On entry to the subpattern, CURLYX is called. This pushes a CURLYX
6610 state, which contains the current count, initialised to -1. It also sets
6611 cur_curlyx to point to this state, with any previous value saved in the
6614 CURLYX then jumps straight to the WHILEM op, rather than executing A,
6615 since the pattern may possibly match zero times (i.e. it's a while {} loop
6616 rather than a do {} while loop).
6618 Each entry to WHILEM represents a successful match of A. The count in the
6619 CURLYX block is incremented, another WHILEM state is pushed, and execution
6620 passes to A or B depending on greediness and the current count.
6622 For example, if matching against the string a1a2a3b (where the aN are
6623 substrings that match /A/), then the match progresses as follows: (the
6624 pushed states are interspersed with the bits of strings matched so far):
6627 <CURLYX cnt=0><WHILEM>
6628 <CURLYX cnt=1><WHILEM> a1 <WHILEM>
6629 <CURLYX cnt=2><WHILEM> a1 <WHILEM> a2 <WHILEM>
6630 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM>
6631 <CURLYX cnt=3><WHILEM> a1 <WHILEM> a2 <WHILEM> a3 <WHILEM> b
6633 (Contrast this with something like CURLYM, which maintains only a single
6637 a1 <CURLYM cnt=1> a2
6638 a1 a2 <CURLYM cnt=2> a3
6639 a1 a2 a3 <CURLYM cnt=3> b
6642 Each WHILEM state block marks a point to backtrack to upon partial failure
6643 of A or B, and also contains some minor state data related to that
6644 iteration. The CURLYX block, pointed to by cur_curlyx, contains the
6645 overall state, such as the count, and pointers to the A and B ops.
6647 This is complicated slightly by nested CURLYX/WHILEM's. Since cur_curlyx
6648 must always point to the *current* CURLYX block, the rules are:
6650 When executing CURLYX, save the old cur_curlyx in the CURLYX state block,
6651 and set cur_curlyx to point the new block.
6653 When popping the CURLYX block after a successful or unsuccessful match,
6654 restore the previous cur_curlyx.
6656 When WHILEM is about to execute B, save the current cur_curlyx, and set it
6657 to the outer one saved in the CURLYX block.
6659 When popping the WHILEM block after a successful or unsuccessful B match,
6660 restore the previous cur_curlyx.
6662 Here's an example for the pattern (AI* BI)*BO
6663 I and O refer to inner and outer, C and W refer to CURLYX and WHILEM:
6666 curlyx backtrack stack
6667 ------ ---------------
6669 CO <CO prev=NULL> <WO>
6670 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
6671 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
6672 NULL <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi <WO prev=CO> bo
6674 At this point the pattern succeeds, and we work back down the stack to
6675 clean up, restoring as we go:
6677 CO <CO prev=NULL> <WO> <CI prev=CO> <WI> ai <WI prev=CI> bi
6678 CI <CO prev=NULL> <WO> <CI prev=CO> <WI> ai
6679 CO <CO prev=NULL> <WO>
6682 *******************************************************************/
6684 #define ST st->u.curlyx
6686 case CURLYX: /* start of /A*B/ (for complex A) */
6688 /* No need to save/restore up to this paren */
6689 I32 parenfloor = scan->flags;
6691 assert(next); /* keep Coverity happy */
6692 if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */
6695 /* XXXX Probably it is better to teach regpush to support
6696 parenfloor > maxopenparen ... */
6697 if (parenfloor > (I32)rex->lastparen)
6698 parenfloor = rex->lastparen; /* Pessimization... */
6700 ST.prev_curlyx= cur_curlyx;
6702 ST.cp = PL_savestack_ix;
6704 /* these fields contain the state of the current curly.
6705 * they are accessed by subsequent WHILEMs */
6706 ST.parenfloor = parenfloor;
6711 ST.count = -1; /* this will be updated by WHILEM */
6712 ST.lastloc = NULL; /* this will be updated by WHILEM */
6714 PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
6716 NOT_REACHED; /* NOTREACHED */
6719 case CURLYX_end: /* just finished matching all of A*B */
6720 cur_curlyx = ST.prev_curlyx;
6723 NOT_REACHED; /* NOTREACHED */
6725 case CURLYX_end_fail: /* just failed to match all of A*B */
6727 cur_curlyx = ST.prev_curlyx;
6730 NOT_REACHED; /* NOTREACHED */
6734 #define ST st->u.whilem
6736 case WHILEM: /* just matched an A in /A*B/ (for complex A) */
6738 /* see the discussion above about CURLYX/WHILEM */
6743 assert(cur_curlyx); /* keep Coverity happy */
6745 min = ARG1(cur_curlyx->u.curlyx.me);
6746 max = ARG2(cur_curlyx->u.curlyx.me);
6747 A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS;
6748 n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */
6749 ST.save_lastloc = cur_curlyx->u.curlyx.lastloc;
6750 ST.cache_offset = 0;
6754 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6755 "%*s whilem: matched %ld out of %d..%d\n",
6756 REPORT_CODE_OFF+depth*2, "", (long)n, min, max)
6759 /* First just match a string of min A's. */
6762 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6764 cur_curlyx->u.curlyx.lastloc = locinput;
6765 REGCP_SET(ST.lastcp);
6767 PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
6769 NOT_REACHED; /* NOTREACHED */
6772 /* If degenerate A matches "", assume A done. */
6774 if (locinput == cur_curlyx->u.curlyx.lastloc) {
6775 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6776 "%*s whilem: empty match detected, trying continuation...\n",
6777 REPORT_CODE_OFF+depth*2, "")
6779 goto do_whilem_B_max;
6782 /* super-linear cache processing.
6784 * The idea here is that for certain types of CURLYX/WHILEM -
6785 * principally those whose upper bound is infinity (and
6786 * excluding regexes that have things like \1 and other very
6787 * non-regular expresssiony things), then if a pattern like
6788 * /....A*.../ fails and we backtrack to the WHILEM, then we
6789 * make a note that this particular WHILEM op was at string
6790 * position 47 (say) when the rest of pattern failed. Then, if
6791 * we ever find ourselves back at that WHILEM, and at string
6792 * position 47 again, we can just fail immediately rather than
6793 * running the rest of the pattern again.
6795 * This is very handy when patterns start to go
6796 * 'super-linear', like in (a+)*(a+)*(a+)*, where you end up
6797 * with a combinatorial explosion of backtracking.
6799 * The cache is implemented as a bit array, with one bit per
6800 * string byte position per WHILEM op (up to 16) - so its
6801 * between 0.25 and 2x the string size.
6803 * To avoid allocating a poscache buffer every time, we do an
6804 * initially countdown; only after we have executed a WHILEM
6805 * op (string-length x #WHILEMs) times do we allocate the
6808 * The top 4 bits of scan->flags byte say how many different
6809 * relevant CURLLYX/WHILEM op pairs there are, while the
6810 * bottom 4-bits is the identifying index number of this
6816 if (!reginfo->poscache_maxiter) {
6817 /* start the countdown: Postpone detection until we
6818 * know the match is not *that* much linear. */
6819 reginfo->poscache_maxiter
6820 = (reginfo->strend - reginfo->strbeg + 1)
6822 /* possible overflow for long strings and many CURLYX's */
6823 if (reginfo->poscache_maxiter < 0)
6824 reginfo->poscache_maxiter = I32_MAX;
6825 reginfo->poscache_iter = reginfo->poscache_maxiter;
6828 if (reginfo->poscache_iter-- == 0) {
6829 /* initialise cache */
6830 const SSize_t size = (reginfo->poscache_maxiter + 7)/8;
6831 regmatch_info_aux *const aux = reginfo->info_aux;
6832 if (aux->poscache) {
6833 if ((SSize_t)reginfo->poscache_size < size) {
6834 Renew(aux->poscache, size, char);
6835 reginfo->poscache_size = size;
6837 Zero(aux->poscache, size, char);
6840 reginfo->poscache_size = size;
6841 Newxz(aux->poscache, size, char);
6843 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6844 "%swhilem: Detected a super-linear match, switching on caching%s...\n",
6845 PL_colors[4], PL_colors[5])
6849 if (reginfo->poscache_iter < 0) {
6850 /* have we already failed at this position? */
6851 SSize_t offset, mask;
6853 reginfo->poscache_iter = -1; /* stop eventual underflow */
6854 offset = (scan->flags & 0xf) - 1
6855 + (locinput - reginfo->strbeg)
6857 mask = 1 << (offset % 8);
6859 if (reginfo->info_aux->poscache[offset] & mask) {
6860 DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
6861 "%*s whilem: (cache) already tried at this position...\n",
6862 REPORT_CODE_OFF+depth*2, "")
6864 sayNO; /* cache records failure */
6866 ST.cache_offset = offset;
6867 ST.cache_mask = mask;
6871 /* Prefer B over A for minimal matching. */
6873 if (cur_curlyx->u.curlyx.minmod) {
6874 ST.save_curlyx = cur_curlyx;
6875 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6876 ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor,
6878 REGCP_SET(ST.lastcp);
6879 PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
6882 NOT_REACHED; /* NOTREACHED */
6885 /* Prefer A over B for maximal matching. */
6887 if (n < max) { /* More greed allowed? */
6888 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6890 cur_curlyx->u.curlyx.lastloc = locinput;
6891 REGCP_SET(ST.lastcp);
6892 PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
6894 NOT_REACHED; /* NOTREACHED */
6896 goto do_whilem_B_max;
6899 NOT_REACHED; /* NOTREACHED */
6901 case WHILEM_B_min: /* just matched B in a minimal match */
6902 case WHILEM_B_max: /* just matched B in a maximal match */
6903 cur_curlyx = ST.save_curlyx;
6906 NOT_REACHED; /* NOTREACHED */
6908 case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
6909 cur_curlyx = ST.save_curlyx;
6910 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6911 cur_curlyx->u.curlyx.count--;
6914 NOT_REACHED; /* NOTREACHED */
6916 case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
6918 case WHILEM_A_pre_fail: /* just failed to match even minimal A */
6919 REGCP_UNWIND(ST.lastcp);
6920 regcppop(rex, &maxopenparen);
6921 cur_curlyx->u.curlyx.lastloc = ST.save_lastloc;
6922 cur_curlyx->u.curlyx.count--;
6925 NOT_REACHED; /* NOTREACHED */
6927 case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
6928 REGCP_UNWIND(ST.lastcp);
6929 regcppop(rex, &maxopenparen); /* Restore some previous $<digit>s? */
6930 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6931 "%*s whilem: failed, trying continuation...\n",
6932 REPORT_CODE_OFF+depth*2, "")
6935 if (cur_curlyx->u.curlyx.count >= REG_INFTY
6936 && ckWARN(WARN_REGEXP)
6937 && !reginfo->warned)
6939 reginfo->warned = TRUE;
6940 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6941 "Complex regular subexpression recursion limit (%d) "
6947 ST.save_curlyx = cur_curlyx;
6948 cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx;
6949 PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
6952 NOT_REACHED; /* NOTREACHED */
6954 case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
6955 cur_curlyx = ST.save_curlyx;
6956 REGCP_UNWIND(ST.lastcp);
6957 regcppop(rex, &maxopenparen);
6959 if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) {
6960 /* Maximum greed exceeded */
6961 if (cur_curlyx->u.curlyx.count >= REG_INFTY
6962 && ckWARN(WARN_REGEXP)
6963 && !reginfo->warned)
6965 reginfo->warned = TRUE;
6966 Perl_warner(aTHX_ packWARN(WARN_REGEXP),
6967 "Complex regular subexpression recursion "
6968 "limit (%d) exceeded",
6971 cur_curlyx->u.curlyx.count--;
6975 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
6976 "%*s trying longer...\n", REPORT_CODE_OFF+depth*2, "")
6978 /* Try grabbing another A and see if it helps. */
6979 cur_curlyx->u.curlyx.lastloc = locinput;
6980 ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor,
6982 REGCP_SET(ST.lastcp);
6983 PUSH_STATE_GOTO(WHILEM_A_min,
6984 /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
6987 NOT_REACHED; /* NOTREACHED */
6990 #define ST st->u.branch
6992 case BRANCHJ: /* /(...|A|...)/ with long next pointer */
6993 next = scan + ARG(scan);
6996 scan = NEXTOPER(scan);
6999 case BRANCH: /* /(...|A|...)/ */
7000 scan = NEXTOPER(scan); /* scan now points to inner node */
7001 ST.lastparen = rex->lastparen;
7002 ST.lastcloseparen = rex->lastcloseparen;
7003 ST.next_branch = next;
7006 /* Now go into the branch */
7008 PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput);
7010 PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
7013 NOT_REACHED; /* NOTREACHED */
7015 case CUTGROUP: /* /(*THEN)/ */
7016 sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
7017 MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7018 PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
7020 NOT_REACHED; /* NOTREACHED */
7022 case CUTGROUP_next_fail:
7025 if (st->u.mark.mark_name)
7026 sv_commit = st->u.mark.mark_name;
7029 NOT_REACHED; /* NOTREACHED */
7034 NOT_REACHED; /* NOTREACHED */
7036 case BRANCH_next_fail: /* that branch failed; try the next, if any */
7041 REGCP_UNWIND(ST.cp);
7042 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7043 scan = ST.next_branch;
7044 /* no more branches? */
7045 if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) {
7047 PerlIO_printf( Perl_debug_log,
7048 "%*s %sBRANCH failed...%s\n",
7049 REPORT_CODE_OFF+depth*2, "",
7055 continue; /* execute next BRANCH[J] op */
7058 case MINMOD: /* next op will be non-greedy, e.g. A*? */
7063 #define ST st->u.curlym
7065 case CURLYM: /* /A{m,n}B/ where A is fixed-length */
7067 /* This is an optimisation of CURLYX that enables us to push
7068 * only a single backtracking state, no matter how many matches
7069 * there are in {m,n}. It relies on the pattern being constant
7070 * length, with no parens to influence future backrefs
7074 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7076 ST.lastparen = rex->lastparen;
7077 ST.lastcloseparen = rex->lastcloseparen;
7079 /* if paren positive, emulate an OPEN/CLOSE around A */
7081 U32 paren = ST.me->flags;
7082 if (paren > maxopenparen)
7083 maxopenparen = paren;
7084 scan += NEXT_OFF(scan); /* Skip former OPEN. */
7092 ST.c1 = CHRTEST_UNINIT;
7095 if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */
7098 curlym_do_A: /* execute the A in /A{m,n}B/ */
7099 PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
7101 NOT_REACHED; /* NOTREACHED */
7103 case CURLYM_A: /* we've just matched an A */
7105 /* after first match, determine A's length: u.curlym.alen */
7106 if (ST.count == 1) {
7107 if (reginfo->is_utf8_target) {
7108 char *s = st->locinput;
7109 while (s < locinput) {
7115 ST.alen = locinput - st->locinput;
7118 ST.count = ST.minmod ? ARG1(ST.me) : ARG2(ST.me);
7121 PerlIO_printf(Perl_debug_log,
7122 "%*s CURLYM now matched %"IVdf" times, len=%"IVdf"...\n",
7123 (int)(REPORT_CODE_OFF+(depth*2)), "",
7124 (IV) ST.count, (IV)ST.alen)
7127 if (cur_eval && cur_eval->u.eval.close_paren &&
7128 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
7132 I32 max = (ST.minmod ? ARG1(ST.me) : ARG2(ST.me));
7133 if ( max == REG_INFTY || ST.count < max )
7134 goto curlym_do_A; /* try to match another A */
7136 goto curlym_do_B; /* try to match B */
7138 case CURLYM_A_fail: /* just failed to match an A */
7139 REGCP_UNWIND(ST.cp);
7141 if (ST.minmod || ST.count < ARG1(ST.me) /* min*/
7142 || (cur_eval && cur_eval->u.eval.close_paren &&
7143 cur_eval->u.eval.close_paren == (U32)ST.me->flags))
7146 curlym_do_B: /* execute the B in /A{m,n}B/ */
7147 if (ST.c1 == CHRTEST_UNINIT) {
7148 /* calculate c1 and c2 for possible match of 1st char
7149 * following curly */
7150 ST.c1 = ST.c2 = CHRTEST_VOID;
7152 if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) {
7153 regnode *text_node = ST.B;
7154 if (! HAS_TEXT(text_node))
7155 FIND_NEXT_IMPT(text_node);
7158 (HAS_TEXT(text_node) && PL_regkind[OP(text_node)] == EXACT)
7160 But the former is redundant in light of the latter.
7162 if this changes back then the macro for
7163 IS_TEXT and friends need to change.
7165 if (PL_regkind[OP(text_node)] == EXACT) {
7166 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7167 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7177 PerlIO_printf(Perl_debug_log,
7178 "%*s CURLYM trying tail with matches=%"IVdf"...\n",
7179 (int)(REPORT_CODE_OFF+(depth*2)),
7182 if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) {
7183 if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) {
7184 if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7185 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7187 /* simulate B failing */
7189 PerlIO_printf(Perl_debug_log,
7190 "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n",
7191 (int)(REPORT_CODE_OFF+(depth*2)),"",
7192 valid_utf8_to_uvchr((U8 *) locinput, NULL),
7193 valid_utf8_to_uvchr(ST.c1_utf8, NULL),
7194 valid_utf8_to_uvchr(ST.c2_utf8, NULL))
7196 state_num = CURLYM_B_fail;
7197 goto reenter_switch;
7200 else if (nextchr != ST.c1 && nextchr != ST.c2) {
7201 /* simulate B failing */
7203 PerlIO_printf(Perl_debug_log,
7204 "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n",
7205 (int)(REPORT_CODE_OFF+(depth*2)),"",
7206 (int) nextchr, ST.c1, ST.c2)
7208 state_num = CURLYM_B_fail;
7209 goto reenter_switch;
7214 /* emulate CLOSE: mark current A as captured */
7215 I32 paren = ST.me->flags;
7217 rex->offs[paren].start
7218 = HOPc(locinput, -ST.alen) - reginfo->strbeg;
7219 rex->offs[paren].end = locinput - reginfo->strbeg;
7220 if ((U32)paren > rex->lastparen)
7221 rex->lastparen = paren;
7222 rex->lastcloseparen = paren;
7225 rex->offs[paren].end = -1;
7226 if (cur_eval && cur_eval->u.eval.close_paren &&
7227 cur_eval->u.eval.close_paren == (U32)ST.me->flags)
7236 PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
7238 NOT_REACHED; /* NOTREACHED */
7240 case CURLYM_B_fail: /* just failed to match a B */
7241 REGCP_UNWIND(ST.cp);
7242 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7244 I32 max = ARG2(ST.me);
7245 if (max != REG_INFTY && ST.count == max)
7247 goto curlym_do_A; /* try to match a further A */
7249 /* backtrack one A */
7250 if (ST.count == ARG1(ST.me) /* min */)
7253 SET_locinput(HOPc(locinput, -ST.alen));
7254 goto curlym_do_B; /* try to match B */
7257 #define ST st->u.curly
7259 #define CURLY_SETPAREN(paren, success) \
7262 rex->offs[paren].start = HOPc(locinput, -1) - reginfo->strbeg; \
7263 rex->offs[paren].end = locinput - reginfo->strbeg; \
7264 if (paren > rex->lastparen) \
7265 rex->lastparen = paren; \
7266 rex->lastcloseparen = paren; \
7269 rex->offs[paren].end = -1; \
7270 rex->lastparen = ST.lastparen; \
7271 rex->lastcloseparen = ST.lastcloseparen; \
7275 case STAR: /* /A*B/ where A is width 1 char */
7279 scan = NEXTOPER(scan);
7282 case PLUS: /* /A+B/ where A is width 1 char */
7286 scan = NEXTOPER(scan);
7289 case CURLYN: /* /(A){m,n}B/ where A is width 1 char */
7290 ST.paren = scan->flags; /* Which paren to set */
7291 ST.lastparen = rex->lastparen;
7292 ST.lastcloseparen = rex->lastcloseparen;
7293 if (ST.paren > maxopenparen)
7294 maxopenparen = ST.paren;
7295 ST.min = ARG1(scan); /* min to match */
7296 ST.max = ARG2(scan); /* max to match */
7297 if (cur_eval && cur_eval->u.eval.close_paren &&
7298 cur_eval->u.eval.close_paren == (U32)ST.paren) {
7302 scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE);
7305 case CURLY: /* /A{m,n}B/ where A is width 1 char */
7307 ST.min = ARG1(scan); /* min to match */
7308 ST.max = ARG2(scan); /* max to match */
7309 scan = NEXTOPER(scan) + NODE_STEP_REGNODE;
7312 * Lookahead to avoid useless match attempts
7313 * when we know what character comes next.
7315 * Used to only do .*x and .*?x, but now it allows
7316 * for )'s, ('s and (?{ ... })'s to be in the way
7317 * of the quantifier and the EXACT-like node. -- japhy
7320 assert(ST.min <= ST.max);
7321 if (! HAS_TEXT(next) && ! JUMPABLE(next)) {
7322 ST.c1 = ST.c2 = CHRTEST_VOID;
7325 regnode *text_node = next;
7327 if (! HAS_TEXT(text_node))
7328 FIND_NEXT_IMPT(text_node);
7330 if (! HAS_TEXT(text_node))
7331 ST.c1 = ST.c2 = CHRTEST_VOID;
7333 if ( PL_regkind[OP(text_node)] != EXACT ) {
7334 ST.c1 = ST.c2 = CHRTEST_VOID;
7338 /* Currently we only get here when
7340 PL_rekind[OP(text_node)] == EXACT
7342 if this changes back then the macro for IS_TEXT and
7343 friends need to change. */
7344 if (! S_setup_EXACTISH_ST_c1_c2(aTHX_
7345 text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8,
7357 char *li = locinput;
7360 regrepeat(rex, &li, ST.A, reginfo, ST.min, depth)
7366 if (ST.c1 == CHRTEST_VOID)
7367 goto curly_try_B_min;
7369 ST.oldloc = locinput;
7371 /* set ST.maxpos to the furthest point along the
7372 * string that could possibly match */
7373 if (ST.max == REG_INFTY) {
7374 ST.maxpos = reginfo->strend - 1;
7376 while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos))
7379 else if (utf8_target) {
7380 int m = ST.max - ST.min;
7381 for (ST.maxpos = locinput;
7382 m >0 && ST.maxpos < reginfo->strend; m--)
7383 ST.maxpos += UTF8SKIP(ST.maxpos);
7386 ST.maxpos = locinput + ST.max - ST.min;
7387 if (ST.maxpos >= reginfo->strend)
7388 ST.maxpos = reginfo->strend - 1;
7390 goto curly_try_B_min_known;
7394 /* avoid taking address of locinput, so it can remain
7396 char *li = locinput;
7397 ST.count = regrepeat(rex, &li, ST.A, reginfo, ST.max, depth);
7398 if (ST.count < ST.min)
7401 if ((ST.count > ST.min)
7402 && (PL_regkind[OP(ST.B)] == EOL) && (OP(ST.B) != MEOL))
7404 /* A{m,n} must come at the end of the string, there's
7405 * no point in backing off ... */
7407 /* ...except that $ and \Z can match before *and* after
7408 newline at the end. Consider "\n\n" =~ /\n+\Z\n/.
7409 We may back off by one in this case. */
7410 if (UCHARAT(locinput - 1) == '\n' && OP(ST.B) != EOS)
7414 goto curly_try_B_max;
7417 NOT_REACHED; /* NOTREACHED */
7419 case CURLY_B_min_known_fail:
7420 /* failed to find B in a non-greedy match where c1,c2 valid */
7422 REGCP_UNWIND(ST.cp);
7424 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7426 /* Couldn't or didn't -- move forward. */
7427 ST.oldloc = locinput;
7429 locinput += UTF8SKIP(locinput);
7433 curly_try_B_min_known:
7434 /* find the next place where 'B' could work, then call B */
7438 n = (ST.oldloc == locinput) ? 0 : 1;
7439 if (ST.c1 == ST.c2) {
7440 /* set n to utf8_distance(oldloc, locinput) */
7441 while (locinput <= ST.maxpos
7442 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)))
7444 locinput += UTF8SKIP(locinput);
7449 /* set n to utf8_distance(oldloc, locinput) */
7450 while (locinput <= ST.maxpos
7451 && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))
7452 && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput)))
7454 locinput += UTF8SKIP(locinput);
7459 else { /* Not utf8_target */
7460 if (ST.c1 == ST.c2) {
7461 while (locinput <= ST.maxpos &&
7462 UCHARAT(locinput) != ST.c1)
7466 while (locinput <= ST.maxpos
7467 && UCHARAT(locinput) != ST.c1
7468 && UCHARAT(locinput) != ST.c2)
7471 n = locinput - ST.oldloc;
7473 if (locinput > ST.maxpos)
7476 /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is
7477 * at b; check that everything between oldloc and
7478 * locinput matches */
7479 char *li = ST.oldloc;
7481 if (regrepeat(rex, &li, ST.A, reginfo, n, depth) < n)
7483 assert(n == REG_INFTY || locinput == li);
7485 CURLY_SETPAREN(ST.paren, ST.count);
7486 if (cur_eval && cur_eval->u.eval.close_paren &&
7487 cur_eval->u.eval.close_paren == (U32)ST.paren) {
7490 PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
7493 NOT_REACHED; /* NOTREACHED */
7495 case CURLY_B_min_fail:
7496 /* failed to find B in a non-greedy match where c1,c2 invalid */
7498 REGCP_UNWIND(ST.cp);
7500 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7502 /* failed -- move forward one */
7504 char *li = locinput;
7505 if (!regrepeat(rex, &li, ST.A, reginfo, 1, depth)) {
7512 if (ST.count <= ST.max || (ST.max == REG_INFTY &&
7513 ST.count > 0)) /* count overflow ? */
7516 CURLY_SETPAREN(ST.paren, ST.count);
7517 if (cur_eval && cur_eval->u.eval.close_paren &&
7518 cur_eval->u.eval.close_paren == (U32)ST.paren) {
7521 PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput);
7526 NOT_REACHED; /* NOTREACHED */
7529 /* a successful greedy match: now try to match B */
7530 if (cur_eval && cur_eval->u.eval.close_paren &&
7531 cur_eval->u.eval.close_paren == (U32)ST.paren) {
7535 bool could_match = locinput < reginfo->strend;
7537 /* If it could work, try it. */
7538 if (ST.c1 != CHRTEST_VOID && could_match) {
7539 if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target)
7541 could_match = memEQ(locinput,
7546 UTF8SKIP(locinput));
7549 could_match = UCHARAT(locinput) == ST.c1
7550 || UCHARAT(locinput) == ST.c2;
7553 if (ST.c1 == CHRTEST_VOID || could_match) {
7554 CURLY_SETPAREN(ST.paren, ST.count);
7555 PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
7557 NOT_REACHED; /* NOTREACHED */
7562 case CURLY_B_max_fail:
7563 /* failed to find B in a greedy match */
7565 REGCP_UNWIND(ST.cp);
7567 UNWIND_PAREN(ST.lastparen, ST.lastcloseparen);
7570 if (--ST.count < ST.min)
7572 locinput = HOPc(locinput, -1);
7573 goto curly_try_B_max;
7577 case END: /* last op of main pattern */
7580 /* we've just finished A in /(??{A})B/; now continue with B */
7582 st->u.eval.prev_rex = rex_sv; /* inner */
7584 /* Save *all* the positions. */
7585 st->u.eval.cp = regcppush(rex, 0, maxopenparen);
7586 rex_sv = cur_eval->u.eval.prev_rex;
7587 is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv));
7588 SET_reg_curpm(rex_sv);
7589 rex = ReANY(rex_sv);
7590 rexi = RXi_GET(rex);
7591 cur_curlyx = cur_eval->u.eval.prev_curlyx;
7593 REGCP_SET(st->u.eval.lastcp);
7595 /* Restore parens of the outer rex without popping the
7597 S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp,
7600 st->u.eval.prev_eval = cur_eval;
7601 cur_eval = cur_eval->u.eval.prev_eval;
7603 PerlIO_printf(Perl_debug_log, "%*s EVAL trying tail ... %"UVxf"\n",
7604 REPORT_CODE_OFF+depth*2, "",PTR2UV(cur_eval)););
7605 if ( nochange_depth )
7608 PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B,
7609 locinput); /* match B */
7612 if (locinput < reginfo->till) {
7613 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
7614 "%sMatch possible, but length=%ld is smaller than requested=%ld, failing!%s\n",
7616 (long)(locinput - startpos),
7617 (long)(reginfo->till - startpos),
7620 sayNO_SILENT; /* Cannot match: too short. */
7622 sayYES; /* Success! */
7624 case SUCCEED: /* successful SUSPEND/UNLESSM/IFMATCH/CURLYM */
7626 PerlIO_printf(Perl_debug_log,
7627 "%*s %ssubpattern success...%s\n",
7628 REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]));
7629 sayYES; /* Success! */
7632 #define ST st->u.ifmatch
7637 case SUSPEND: /* (?>A) */
7639 newstart = locinput;
7642 case UNLESSM: /* -ve lookaround: (?!A), or with flags, (?<!A) */
7644 goto ifmatch_trivial_fail_test;
7646 case IFMATCH: /* +ve lookaround: (?=A), or with flags, (?<=A) */
7648 ifmatch_trivial_fail_test:
7650 char * const s = HOPBACKc(locinput, scan->flags);
7655 sw = 1 - cBOOL(ST.wanted);
7659 next = scan + ARG(scan);
7667 newstart = locinput;
7671 ST.logical = logical;
7672 logical = 0; /* XXX: reset state of logical once it has been saved into ST */
7674 /* execute body of (?...A) */
7675 PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
7677 NOT_REACHED; /* NOTREACHED */
7680 case IFMATCH_A_fail: /* body of (?...A) failed */
7681 ST.wanted = !ST.wanted;
7684 case IFMATCH_A: /* body of (?...A) succeeded */
7686 sw = cBOOL(ST.wanted);
7688 else if (!ST.wanted)
7691 if (OP(ST.me) != SUSPEND) {
7692 /* restore old position except for (?>...) */
7693 locinput = st->locinput;
7695 scan = ST.me + ARG(ST.me);
7698 continue; /* execute B */
7702 case LONGJMP: /* alternative with many branches compiles to
7703 * (BRANCHJ; EXACT ...; LONGJMP ) x N */
7704 next = scan + ARG(scan);
7709 case COMMIT: /* (*COMMIT) */
7710 reginfo->cutpoint = reginfo->strend;
7713 case PRUNE: /* (*PRUNE) */
7715 sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7716 PUSH_STATE_GOTO(COMMIT_next, next, locinput);
7718 NOT_REACHED; /* NOTREACHED */
7720 case COMMIT_next_fail:
7724 case OPFAIL: /* (*FAIL) */
7727 NOT_REACHED; /* NOTREACHED */
7729 #define ST st->u.mark
7730 case MARKPOINT: /* (*MARK:foo) */
7731 ST.prev_mark = mark_state;
7732 ST.mark_name = sv_commit = sv_yes_mark
7733 = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7735 ST.mark_loc = locinput;
7736 PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
7738 NOT_REACHED; /* NOTREACHED */
7740 case MARKPOINT_next:
7741 mark_state = ST.prev_mark;
7744 NOT_REACHED; /* NOTREACHED */
7746 case MARKPOINT_next_fail:
7747 if (popmark && sv_eq(ST.mark_name,popmark))
7749 if (ST.mark_loc > startpoint)
7750 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
7751 popmark = NULL; /* we found our mark */
7752 sv_commit = ST.mark_name;
7755 PerlIO_printf(Perl_debug_log,
7756 "%*s %ssetting cutpoint to mark:%"SVf"...%s\n",
7757 REPORT_CODE_OFF+depth*2, "",
7758 PL_colors[4], SVfARG(sv_commit), PL_colors[5]);
7761 mark_state = ST.prev_mark;
7762 sv_yes_mark = mark_state ?
7763 mark_state->u.mark.mark_name : NULL;
7766 NOT_REACHED; /* NOTREACHED */
7768 case SKIP: /* (*SKIP) */
7770 /* (*SKIP) : if we fail we cut here*/
7771 ST.mark_name = NULL;
7772 ST.mark_loc = locinput;
7773 PUSH_STATE_GOTO(SKIP_next,next, locinput);
7775 /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was,
7776 otherwise do nothing. Meaning we need to scan
7778 regmatch_state *cur = mark_state;
7779 SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
7782 if ( sv_eq( cur->u.mark.mark_name,
7785 ST.mark_name = find;
7786 PUSH_STATE_GOTO( SKIP_next, next, locinput);
7788 cur = cur->u.mark.prev_mark;
7791 /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */
7794 case SKIP_next_fail:
7796 /* (*CUT:NAME) - Set up to search for the name as we
7797 collapse the stack*/
7798 popmark = ST.mark_name;
7800 /* (*CUT) - No name, we cut here.*/
7801 if (ST.mark_loc > startpoint)
7802 reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1);
7803 /* but we set sv_commit to latest mark_name if there
7804 is one so they can test to see how things lead to this
7807 sv_commit=mark_state->u.mark.mark_name;
7812 NOT_REACHED; /* NOTREACHED */
7815 case LNBREAK: /* \R */
7816 if ((n=is_LNBREAK_safe(locinput, reginfo->strend, utf8_target))) {
7823 PerlIO_printf(Perl_error_log, "%"UVxf" %d\n",
7824 PTR2UV(scan), OP(scan));
7825 Perl_croak(aTHX_ "regexp memory corruption");
7827 /* this is a point to jump to in order to increment
7828 * locinput by one character */
7830 assert(!NEXTCHR_IS_EOS);
7832 locinput += PL_utf8skip[nextchr];
7833 /* locinput is allowed to go 1 char off the end, but not 2+ */
7834 if (locinput > reginfo->strend)
7843 /* switch break jumps here */
7844 scan = next; /* prepare to execute the next op and ... */
7845 continue; /* ... jump back to the top, reusing st */
7849 /* push a state that backtracks on success */
7850 st->u.yes.prev_yes_state = yes_state;
7854 /* push a new regex state, then continue at scan */
7856 regmatch_state *newst;
7859 regmatch_state *cur = st;
7860 regmatch_state *curyes = yes_state;
7862 regmatch_slab *slab = PL_regmatch_slab;
7863 for (;curd > -1;cur--,curd--) {
7864 if (cur < SLAB_FIRST(slab)) {
7866 cur = SLAB_LAST(slab);
7868 PerlIO_printf(Perl_error_log, "%*s#%-3d %-10s %s\n",
7869 REPORT_CODE_OFF + 2 + depth * 2,"",
7870 curd, PL_reg_name[cur->resume_state],
7871 (curyes == cur) ? "yes" : ""
7874 curyes = cur->u.yes.prev_yes_state;
7877 DEBUG_STATE_pp("push")
7880 st->locinput = locinput;
7882 if (newst > SLAB_LAST(PL_regmatch_slab))
7883 newst = S_push_slab(aTHX);
7884 PL_regmatch_state = newst;
7886 locinput = pushinput;
7894 * We get here only if there's trouble -- normally "case END" is
7895 * the terminating point.
7897 Perl_croak(aTHX_ "corrupted regexp pointers");
7900 NOT_REACHED; /* NOTREACHED */
7904 /* we have successfully completed a subexpression, but we must now
7905 * pop to the state marked by yes_state and continue from there */
7906 assert(st != yes_state);
7908 while (st != yes_state) {
7910 if (st < SLAB_FIRST(PL_regmatch_slab)) {
7911 PL_regmatch_slab = PL_regmatch_slab->prev;
7912 st = SLAB_LAST(PL_regmatch_slab);
7916 DEBUG_STATE_pp("pop (no final)");
7918 DEBUG_STATE_pp("pop (yes)");
7924 while (yes_state < SLAB_FIRST(PL_regmatch_slab)
7925 || yes_state > SLAB_LAST(PL_regmatch_slab))
7927 /* not in this slab, pop slab */
7928 depth -= (st - SLAB_FIRST(PL_regmatch_slab) + 1);
7929 PL_regmatch_slab = PL_regmatch_slab->prev;
7930 st = SLAB_LAST(PL_regmatch_slab);
7932 depth -= (st - yes_state);
7935 yes_state = st->u.yes.prev_yes_state;
7936 PL_regmatch_state = st;
7939 locinput= st->locinput;
7940 state_num = st->resume_state + no_final;
7941 goto reenter_switch;
7944 DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n",
7945 PL_colors[4], PL_colors[5]));
7947 if (reginfo->info_aux_eval) {
7948 /* each successfully executed (?{...}) block does the equivalent of
7949 * local $^R = do {...}
7950 * When popping the save stack, all these locals would be undone;
7951 * bypass this by setting the outermost saved $^R to the latest
7953 /* I dont know if this is needed or works properly now.
7954 * see code related to PL_replgv elsewhere in this file.
7957 if (oreplsv != GvSV(PL_replgv))
7958 sv_setsv(oreplsv, GvSV(PL_replgv));
7965 PerlIO_printf(Perl_debug_log,
7966 "%*s %sfailed...%s\n",
7967 REPORT_CODE_OFF+depth*2, "",
7968 PL_colors[4], PL_colors[5])
7980 /* there's a previous state to backtrack to */
7982 if (st < SLAB_FIRST(PL_regmatch_slab)) {
7983 PL_regmatch_slab = PL_regmatch_slab->prev;
7984 st = SLAB_LAST(PL_regmatch_slab);
7986 PL_regmatch_state = st;
7987 locinput= st->locinput;
7989 DEBUG_STATE_pp("pop");
7991 if (yes_state == st)
7992 yes_state = st->u.yes.prev_yes_state;
7994 state_num = st->resume_state + 1; /* failure = success + 1 */
7995 goto reenter_switch;
8000 if (rex->intflags & PREGf_VERBARG_SEEN) {
8001 SV *sv_err = get_sv("REGERROR", 1);
8002 SV *sv_mrk = get_sv("REGMARK", 1);
8004 sv_commit = &PL_sv_no;
8006 sv_yes_mark = &PL_sv_yes;
8009 sv_commit = &PL_sv_yes;
8010 sv_yes_mark = &PL_sv_no;
8014 sv_setsv(sv_err, sv_commit);
8015 sv_setsv(sv_mrk, sv_yes_mark);
8019 if (last_pushed_cv) {
8022 PERL_UNUSED_VAR(SP);
8025 assert(!result || locinput - reginfo->strbeg >= 0);
8026 return result ? locinput - reginfo->strbeg : -1;
8030 - regrepeat - repeatedly match something simple, report how many
8032 * What 'simple' means is a node which can be the operand of a quantifier like
8035 * startposp - pointer a pointer to the start position. This is updated
8036 * to point to the byte following the highest successful
8038 * p - the regnode to be repeatedly matched against.
8039 * reginfo - struct holding match state, such as strend
8040 * max - maximum number of things to match.
8041 * depth - (for debugging) backtracking depth.
8044 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
8045 regmatch_info *const reginfo, I32 max, int depth)
8047 char *scan; /* Pointer to current position in target string */
8049 char *loceol = reginfo->strend; /* local version */
8050 I32 hardcount = 0; /* How many matches so far */
8051 bool utf8_target = reginfo->is_utf8_target;
8052 unsigned int to_complement = 0; /* Invert the result? */
8054 _char_class_number classnum;
8056 PERL_UNUSED_ARG(depth);
8059 PERL_ARGS_ASSERT_REGREPEAT;
8062 if (max == REG_INFTY)
8064 else if (! utf8_target && loceol - scan > max)
8065 loceol = scan + max;
8067 /* Here, for the case of a non-UTF-8 target we have adjusted <loceol> down
8068 * to the maximum of how far we should go in it (leaving it set to the real
8069 * end, if the maximum permissible would take us beyond that). This allows
8070 * us to make the loop exit condition that we haven't gone past <loceol> to
8071 * also mean that we haven't exceeded the max permissible count, saving a
8072 * test each time through the loop. But it assumes that the OP matches a
8073 * single byte, which is true for most of the OPs below when applied to a
8074 * non-UTF-8 target. Those relatively few OPs that don't have this
8075 * characteristic will have to compensate.
8077 * There is no adjustment for UTF-8 targets, as the number of bytes per
8078 * character varies. OPs will have to test both that the count is less
8079 * than the max permissible (using <hardcount> to keep track), and that we
8080 * are still within the bounds of the string (using <loceol>. A few OPs
8081 * match a single byte no matter what the encoding. They can omit the max
8082 * test if, for the UTF-8 case, they do the adjustment that was skipped
8085 * Thus, the code above sets things up for the common case; and exceptional
8086 * cases need extra work; the common case is to make sure <scan> doesn't
8087 * go past <loceol>, and for UTF-8 to also use <hardcount> to make sure the
8088 * count doesn't exceed the maximum permissible */
8093 while (scan < loceol && hardcount < max && *scan != '\n') {
8094 scan += UTF8SKIP(scan);
8098 while (scan < loceol && *scan != '\n')
8104 while (scan < loceol && hardcount < max) {
8105 scan += UTF8SKIP(scan);
8112 case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
8113 if (utf8_target && loceol - scan > max) {
8115 /* <loceol> hadn't been adjusted in the UTF-8 case */
8123 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8124 if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
8125 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
8129 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8133 /* Can use a simple loop if the pattern char to match on is invariant
8134 * under UTF-8, or both target and pattern aren't UTF-8. Note that we
8135 * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
8136 * true iff it doesn't matter if the argument is in UTF-8 or not */
8137 if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! reginfo->is_utf8_pat)) {
8138 if (utf8_target && loceol - scan > max) {
8139 /* We didn't adjust <loceol> because is UTF-8, but ok to do so,
8140 * since here, to match at all, 1 char == 1 byte */
8141 loceol = scan + max;
8143 while (scan < loceol && UCHARAT(scan) == c) {
8147 else if (reginfo->is_utf8_pat) {
8149 STRLEN scan_char_len;
8151 /* When both target and pattern are UTF-8, we have to do
8153 while (hardcount < max
8155 && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p)
8156 && memEQ(scan, STRING(p), scan_char_len))
8158 scan += scan_char_len;
8162 else if (! UTF8_IS_ABOVE_LATIN1(c)) {
8164 /* Target isn't utf8; convert the character in the UTF-8
8165 * pattern to non-UTF8, and do a simple loop */
8166 c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
8167 while (scan < loceol && UCHARAT(scan) == c) {
8170 } /* else pattern char is above Latin1, can't possibly match the
8175 /* Here, the string must be utf8; pattern isn't, and <c> is
8176 * different in utf8 than not, so can't compare them directly.
8177 * Outside the loop, find the two utf8 bytes that represent c, and
8178 * then look for those in sequence in the utf8 string */
8179 U8 high = UTF8_TWO_BYTE_HI(c);
8180 U8 low = UTF8_TWO_BYTE_LO(c);
8182 while (hardcount < max
8183 && scan + 1 < loceol
8184 && UCHARAT(scan) == high
8185 && UCHARAT(scan + 1) == low)
8193 case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
8194 assert(! reginfo->is_utf8_pat);
8197 utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
8201 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8202 utf8_flags = FOLDEQ_LOCALE;
8205 case EXACTF: /* This node only generated for non-utf8 patterns */
8206 assert(! reginfo->is_utf8_pat);
8211 if (! utf8_target) {
8214 utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
8215 | FOLDEQ_S2_FOLDS_SANE;
8220 utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
8224 U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1];
8226 assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
8228 if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8,
8231 if (c1 == CHRTEST_VOID) {
8232 /* Use full Unicode fold matching */
8233 char *tmpeol = reginfo->strend;
8234 STRLEN pat_len = reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1;
8235 while (hardcount < max
8236 && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target,
8237 STRING(p), NULL, pat_len,
8238 reginfo->is_utf8_pat, utf8_flags))
8241 tmpeol = reginfo->strend;
8245 else if (utf8_target) {
8247 while (scan < loceol
8249 && memEQ(scan, c1_utf8, UTF8SKIP(scan)))
8251 scan += UTF8SKIP(scan);
8256 while (scan < loceol
8258 && (memEQ(scan, c1_utf8, UTF8SKIP(scan))
8259 || memEQ(scan, c2_utf8, UTF8SKIP(scan))))
8261 scan += UTF8SKIP(scan);
8266 else if (c1 == c2) {
8267 while (scan < loceol && UCHARAT(scan) == c1) {
8272 while (scan < loceol &&
8273 (UCHARAT(scan) == c1 || UCHARAT(scan) == c2))
8282 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8286 while (hardcount < max
8288 && reginclass(prog, p, (U8*)scan, (U8*) loceol, utf8_target))
8290 scan += UTF8SKIP(scan);
8294 while (scan < loceol && REGINCLASS(prog, p, (U8*)scan))
8299 /* The argument (FLAGS) to all the POSIX node types is the class number */
8306 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8307 if (! utf8_target) {
8308 while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
8314 while (hardcount < max && scan < loceol
8315 && to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(p),
8318 scan += UTF8SKIP(scan);
8331 if (utf8_target && loceol - scan > max) {
8333 /* We didn't adjust <loceol> at the beginning of this routine
8334 * because is UTF-8, but it is actually ok to do so, since here, to
8335 * match, 1 char == 1 byte. */
8336 loceol = scan + max;
8338 while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) {
8351 if (! utf8_target) {
8352 while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) {
8358 /* The complement of something that matches only ASCII matches all
8359 * non-ASCII, plus everything in ASCII that isn't in the class. */
8360 while (hardcount < max && scan < loceol
8361 && (! isASCII_utf8(scan)
8362 || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
8364 scan += UTF8SKIP(scan);
8375 if (! utf8_target) {
8376 while (scan < loceol && to_complement
8377 ^ cBOOL(_generic_isCC((U8) *scan, FLAGS(p))))
8384 classnum = (_char_class_number) FLAGS(p);
8385 if (classnum < _FIRST_NON_SWASH_CC) {
8387 /* Here, a swash is needed for above-Latin1 code points.
8388 * Process as many Latin1 code points using the built-in rules.
8389 * Go to another loop to finish processing upon encountering
8390 * the first Latin1 code point. We could do that in this loop
8391 * as well, but the other way saves having to test if the swash
8392 * has been loaded every time through the loop: extra space to
8394 while (hardcount < max && scan < loceol) {
8395 if (UTF8_IS_INVARIANT(*scan)) {
8396 if (! (to_complement ^ cBOOL(_generic_isCC((U8) *scan,
8403 else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
8404 if (! (to_complement
8405 ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
8414 goto found_above_latin1;
8421 /* For these character classes, the knowledge of how to handle
8422 * every code point is compiled in to Perl via a macro. This
8423 * code is written for making the loops as tight as possible.
8424 * It could be refactored to save space instead */
8426 case _CC_ENUM_SPACE:
8427 while (hardcount < max
8429 && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
8431 scan += UTF8SKIP(scan);
8435 case _CC_ENUM_BLANK:
8436 while (hardcount < max
8438 && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
8440 scan += UTF8SKIP(scan);
8444 case _CC_ENUM_XDIGIT:
8445 while (hardcount < max
8447 && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
8449 scan += UTF8SKIP(scan);
8453 case _CC_ENUM_VERTSPACE:
8454 while (hardcount < max
8456 && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
8458 scan += UTF8SKIP(scan);
8462 case _CC_ENUM_CNTRL:
8463 while (hardcount < max
8465 && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
8467 scan += UTF8SKIP(scan);
8472 Perl_croak(aTHX_ "panic: regrepeat() node %d='%s' has an unexpected character class '%d'", OP(p), PL_reg_name[OP(p)], classnum);
8478 found_above_latin1: /* Continuation of POSIXU and NPOSIXU */
8480 /* Load the swash if not already present */
8481 if (! PL_utf8_swash_ptrs[classnum]) {
8482 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
8483 PL_utf8_swash_ptrs[classnum] = _core_swash_init(
8487 PL_XPosix_ptrs[classnum], &flags);
8490 while (hardcount < max && scan < loceol
8491 && to_complement ^ cBOOL(_generic_utf8(
8494 swash_fetch(PL_utf8_swash_ptrs[classnum],
8498 scan += UTF8SKIP(scan);
8505 while (hardcount < max && scan < loceol &&
8506 (c=is_LNBREAK_utf8_safe(scan, loceol))) {
8511 /* LNBREAK can match one or two latin chars, which is ok, but we
8512 * have to use hardcount in this situation, and throw away the
8513 * adjustment to <loceol> done before the switch statement */
8514 loceol = reginfo->strend;
8515 while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) {
8524 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
8538 /* These are all 0 width, so match right here or not at all. */
8542 Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
8544 NOT_REACHED; /* NOTREACHED */
8551 c = scan - *startposp;
8555 GET_RE_DEBUG_FLAGS_DECL;
8557 SV * const prop = sv_newmortal();
8558 regprop(prog, prop, p, reginfo, NULL);
8559 PerlIO_printf(Perl_debug_log,
8560 "%*s %s can match %"IVdf" times out of %"IVdf"...\n",
8561 REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
8569 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
8571 - regclass_swash - prepare the utf8 swash. Wraps the shared core version to
8572 create a copy so that changes the caller makes won't change the shared one.
8573 If <altsvp> is non-null, will return NULL in it, for back-compat.
8576 Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp, SV **altsvp)
8578 PERL_ARGS_ASSERT_REGCLASS_SWASH;
8584 return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
8587 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
8590 - reginclass - determine if a character falls into a character class
8592 n is the ANYOF-type regnode
8593 p is the target string
8594 p_end points to one byte beyond the end of the target string
8595 utf8_target tells whether p is in UTF-8.
8597 Returns true if matched; false otherwise.
8599 Note that this can be a synthetic start class, a combination of various
8600 nodes, so things you think might be mutually exclusive, such as locale,
8601 aren't. It can match both locale and non-locale
8606 S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target)
8609 const char flags = ANYOF_FLAGS(n);
8613 PERL_ARGS_ASSERT_REGINCLASS;
8615 /* If c is not already the code point, get it. Note that
8616 * UTF8_IS_INVARIANT() works even if not in UTF-8 */
8617 if (! UTF8_IS_INVARIANT(c) && utf8_target) {
8619 c = utf8n_to_uvchr(p, p_end - p, &c_len,
8620 (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV)
8621 | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY);
8622 /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for
8623 * UTF8_ALLOW_FFFF */
8624 if (c_len == (STRLEN)-1)
8625 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
8626 if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
8627 _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
8631 /* If this character is potentially in the bitmap, check it */
8632 if (c < NUM_ANYOF_CODE_POINTS) {
8633 if (ANYOF_BITMAP_TEST(n, c))
8635 else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII)
8641 else if (flags & ANYOF_LOCALE_FLAGS) {
8642 if ((flags & ANYOF_LOC_FOLD)
8644 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
8648 else if (ANYOF_POSIXL_TEST_ANY_SET(n)
8652 /* The data structure is arranged so bits 0, 2, 4, ... are set
8653 * if the class includes the Posix character class given by
8654 * bit/2; and 1, 3, 5, ... are set if the class includes the
8655 * complemented Posix class given by int(bit/2). So we loop
8656 * through the bits, each time changing whether we complement
8657 * the result or not. Suppose for the sake of illustration
8658 * that bits 0-3 mean respectively, \w, \W, \s, \S. If bit 0
8659 * is set, it means there is a match for this ANYOF node if the
8660 * character is in the class given by the expression (0 / 2 = 0
8661 * = \w). If it is in that class, isFOO_lc() will return 1,
8662 * and since 'to_complement' is 0, the result will stay TRUE,
8663 * and we exit the loop. Suppose instead that bit 0 is 0, but
8664 * bit 1 is 1. That means there is a match if the character
8665 * matches \W. We won't bother to call isFOO_lc() on bit 0,
8666 * but will on bit 1. On the second iteration 'to_complement'
8667 * will be 1, so the exclusive or will reverse things, so we
8668 * are testing for \W. On the third iteration, 'to_complement'
8669 * will be 0, and we would be testing for \s; the fourth
8670 * iteration would test for \S, etc.
8672 * Note that this code assumes that all the classes are closed
8673 * under folding. For example, if a character matches \w, then
8674 * its fold does too; and vice versa. This should be true for
8675 * any well-behaved locale for all the currently defined Posix
8676 * classes, except for :lower: and :upper:, which are handled
8677 * by the pseudo-class :cased: which matches if either of the
8678 * other two does. To get rid of this assumption, an outer
8679 * loop could be used below to iterate over both the source
8680 * character, and its fold (if different) */
8683 int to_complement = 0;
8685 while (count < ANYOF_MAX) {
8686 if (ANYOF_POSIXL_TEST(n, count)
8687 && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c)))
8700 /* If the bitmap didn't (or couldn't) match, and something outside the
8701 * bitmap could match, try that. */
8703 if (c >= NUM_ANYOF_CODE_POINTS
8704 && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
8706 match = TRUE; /* Everything above the bitmap matches */
8708 else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)
8709 || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES))
8710 || ((flags & ANYOF_LOC_FOLD)
8711 && IN_UTF8_CTYPE_LOCALE
8712 && ARG(n) != ANYOF_ONLY_HAS_BITMAP))
8714 SV* only_utf8_locale = NULL;
8715 SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
8716 &only_utf8_locale, NULL);
8722 } else { /* Convert to utf8 */
8723 utf8_p = utf8_buffer;
8724 append_utf8_from_native_byte(*p, &utf8_p);
8725 utf8_p = utf8_buffer;
8728 if (swash_fetch(sw, utf8_p, TRUE)) {
8732 if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) {
8733 match = _invlist_contains_cp(only_utf8_locale, c);
8737 if (UNICODE_IS_SUPER(c)
8738 && (flags & ANYOF_WARN_SUPER)
8739 && ckWARN_d(WARN_NON_UNICODE))
8741 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
8742 "Matched non-Unicode code point 0x%04"UVXf" against Unicode property; may not be portable", c);
8746 #if ANYOF_INVERT != 1
8747 /* Depending on compiler optimization cBOOL takes time, so if don't have to
8749 # error ANYOF_INVERT needs to be set to 1, or guarded with cBOOL below,
8752 /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */
8753 return (flags & ANYOF_INVERT) ^ match;
8757 S_reghop3(U8 *s, SSize_t off, const U8* lim)
8759 /* return the position 'off' UTF-8 characters away from 's', forward if
8760 * 'off' >= 0, backwards if negative. But don't go outside of position
8761 * 'lim', which better be < s if off < 0 */
8763 PERL_ARGS_ASSERT_REGHOP3;
8766 while (off-- && s < lim) {
8767 /* XXX could check well-formedness here */
8772 while (off++ && s > lim) {
8774 if (UTF8_IS_CONTINUED(*s)) {
8775 while (s > lim && UTF8_IS_CONTINUATION(*s))
8778 /* XXX could check well-formedness here */
8785 S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim)
8787 PERL_ARGS_ASSERT_REGHOP4;
8790 while (off-- && s < rlim) {
8791 /* XXX could check well-formedness here */
8796 while (off++ && s > llim) {
8798 if (UTF8_IS_CONTINUED(*s)) {
8799 while (s > llim && UTF8_IS_CONTINUATION(*s))
8802 /* XXX could check well-formedness here */
8808 /* like reghop3, but returns NULL on overrun, rather than returning last
8812 S_reghopmaybe3(U8* s, SSize_t off, const U8* lim)
8814 PERL_ARGS_ASSERT_REGHOPMAYBE3;
8817 while (off-- && s < lim) {
8818 /* XXX could check well-formedness here */
8825 while (off++ && s > lim) {
8827 if (UTF8_IS_CONTINUED(*s)) {
8828 while (s > lim && UTF8_IS_CONTINUATION(*s))
8831 /* XXX could check well-formedness here */
8840 /* when executing a regex that may have (?{}), extra stuff needs setting
8841 up that will be visible to the called code, even before the current
8842 match has finished. In particular:
8844 * $_ is localised to the SV currently being matched;
8845 * pos($_) is created if necessary, ready to be updated on each call-out
8847 * a fake PMOP is created that can be set to PL_curpm (normally PL_curpm
8848 isn't set until the current pattern is successfully finished), so that
8849 $1 etc of the match-so-far can be seen;
8850 * save the old values of subbeg etc of the current regex, and set then
8851 to the current string (again, this is normally only done at the end
8856 S_setup_eval_state(pTHX_ regmatch_info *const reginfo)
8859 regexp *const rex = ReANY(reginfo->prog);
8860 regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval;
8862 eval_state->rex = rex;
8865 /* Make $_ available to executed code. */
8866 if (reginfo->sv != DEFSV) {
8868 DEFSV_set(reginfo->sv);
8871 if (!(mg = mg_find_mglob(reginfo->sv))) {
8872 /* prepare for quick setting of pos */
8873 mg = sv_magicext_mglob(reginfo->sv);
8876 eval_state->pos_magic = mg;
8877 eval_state->pos = mg->mg_len;
8878 eval_state->pos_flags = mg->mg_flags;
8881 eval_state->pos_magic = NULL;
8883 if (!PL_reg_curpm) {
8884 /* PL_reg_curpm is a fake PMOP that we can attach the current
8885 * regex to and point PL_curpm at, so that $1 et al are visible
8886 * within a /(?{})/. It's just allocated once per interpreter the
8887 * first time its needed */
8888 Newxz(PL_reg_curpm, 1, PMOP);
8891 SV* const repointer = &PL_sv_undef;
8892 /* this regexp is also owned by the new PL_reg_curpm, which
8893 will try to free it. */
8894 av_push(PL_regex_padav, repointer);
8895 PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav);
8896 PL_regex_pad = AvARRAY(PL_regex_padav);
8900 SET_reg_curpm(reginfo->prog);
8901 eval_state->curpm = PL_curpm;
8902 PL_curpm = PL_reg_curpm;
8903 if (RXp_MATCH_COPIED(rex)) {
8904 /* Here is a serious problem: we cannot rewrite subbeg,
8905 since it may be needed if this match fails. Thus
8906 $` inside (?{}) could fail... */
8907 eval_state->subbeg = rex->subbeg;
8908 eval_state->sublen = rex->sublen;
8909 eval_state->suboffset = rex->suboffset;
8910 eval_state->subcoffset = rex->subcoffset;
8912 eval_state->saved_copy = rex->saved_copy;
8914 RXp_MATCH_COPIED_off(rex);
8917 eval_state->subbeg = NULL;
8918 rex->subbeg = (char *)reginfo->strbeg;
8920 rex->subcoffset = 0;
8921 rex->sublen = reginfo->strend - reginfo->strbeg;
8925 /* destructor to clear up regmatch_info_aux and regmatch_info_aux_eval */
8928 S_cleanup_regmatch_info_aux(pTHX_ void *arg)
8930 regmatch_info_aux *aux = (regmatch_info_aux *) arg;
8931 regmatch_info_aux_eval *eval_state = aux->info_aux_eval;
8934 Safefree(aux->poscache);
8938 /* undo the effects of S_setup_eval_state() */
8940 if (eval_state->subbeg) {
8941 regexp * const rex = eval_state->rex;
8942 rex->subbeg = eval_state->subbeg;
8943 rex->sublen = eval_state->sublen;
8944 rex->suboffset = eval_state->suboffset;
8945 rex->subcoffset = eval_state->subcoffset;
8947 rex->saved_copy = eval_state->saved_copy;
8949 RXp_MATCH_COPIED_on(rex);
8951 if (eval_state->pos_magic)
8953 eval_state->pos_magic->mg_len = eval_state->pos;
8954 eval_state->pos_magic->mg_flags =
8955 (eval_state->pos_magic->mg_flags & ~MGf_BYTES)
8956 | (eval_state->pos_flags & MGf_BYTES);
8959 PL_curpm = eval_state->curpm;
8962 PL_regmatch_state = aux->old_regmatch_state;
8963 PL_regmatch_slab = aux->old_regmatch_slab;
8965 /* free all slabs above current one - this must be the last action
8966 * of this function, as aux and eval_state are allocated within
8967 * slabs and may be freed here */
8969 s = PL_regmatch_slab->next;
8971 PL_regmatch_slab->next = NULL;
8973 regmatch_slab * const osl = s;
8982 S_to_utf8_substr(pTHX_ regexp *prog)
8984 /* Converts substr fields in prog from bytes to UTF-8, calling fbm_compile
8985 * on the converted value */
8989 PERL_ARGS_ASSERT_TO_UTF8_SUBSTR;
8992 if (prog->substrs->data[i].substr
8993 && !prog->substrs->data[i].utf8_substr) {
8994 SV* const sv = newSVsv(prog->substrs->data[i].substr);
8995 prog->substrs->data[i].utf8_substr = sv;
8996 sv_utf8_upgrade(sv);
8997 if (SvVALID(prog->substrs->data[i].substr)) {
8998 if (SvTAIL(prog->substrs->data[i].substr)) {
8999 /* Trim the trailing \n that fbm_compile added last
9001 SvCUR_set(sv, SvCUR(sv) - 1);
9002 /* Whilst this makes the SV technically "invalid" (as its
9003 buffer is no longer followed by "\0") when fbm_compile()
9004 adds the "\n" back, a "\0" is restored. */
9005 fbm_compile(sv, FBMcf_TAIL);
9009 if (prog->substrs->data[i].substr == prog->check_substr)
9010 prog->check_utf8 = sv;
9016 S_to_byte_substr(pTHX_ regexp *prog)
9018 /* Converts substr fields in prog from UTF-8 to bytes, calling fbm_compile
9019 * on the converted value; returns FALSE if can't be converted. */
9023 PERL_ARGS_ASSERT_TO_BYTE_SUBSTR;
9026 if (prog->substrs->data[i].utf8_substr
9027 && !prog->substrs->data[i].substr) {
9028 SV* sv = newSVsv(prog->substrs->data[i].utf8_substr);
9029 if (! sv_utf8_downgrade(sv, TRUE)) {
9032 if (SvVALID(prog->substrs->data[i].utf8_substr)) {
9033 if (SvTAIL(prog->substrs->data[i].utf8_substr)) {
9034 /* Trim the trailing \n that fbm_compile added last
9036 SvCUR_set(sv, SvCUR(sv) - 1);
9037 fbm_compile(sv, FBMcf_TAIL);
9041 prog->substrs->data[i].substr = sv;
9042 if (prog->substrs->data[i].utf8_substr == prog->check_utf8)
9043 prog->check_substr = sv;
9051 * ex: set ts=8 sts=4 sw=4 et: