5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 extern const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
100 #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
101 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
109 # if defined(BUGGY_MSC6)
110 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
111 # pragma optimize("a",off)
112 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
113 # pragma optimize("w",on )
114 # endif /* BUGGY_MSC6 */
118 #define STATIC static
122 typedef struct RExC_state_t {
123 U32 flags; /* RXf_* are we folding, multilining? */
124 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
125 char *precomp; /* uncompiled string. */
126 REGEXP *rx_sv; /* The SV that is the regexp. */
127 regexp *rx; /* perl core regexp structure */
128 regexp_internal *rxi; /* internal data for regexp object pprivate field */
129 char *start; /* Start of input for compile */
130 char *end; /* End of input for compile */
131 char *parse; /* Input-scan pointer. */
132 I32 whilem_seen; /* number of WHILEM in this expr */
133 regnode *emit_start; /* Start of emitted-code area */
134 regnode *emit_bound; /* First regnode outside of the allocated space */
135 regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
136 I32 naughty; /* How bad is this pattern? */
137 I32 sawback; /* Did we see \1, ...? */
139 I32 size; /* Code size. */
140 I32 npar; /* Capture buffer count, (OPEN). */
141 I32 cpar; /* Capture buffer count, (CLOSE). */
142 I32 nestroot; /* root parens we are in - used by accept */
145 regnode **open_parens; /* pointers to open parens */
146 regnode **close_parens; /* pointers to close parens */
147 regnode *opend; /* END node in program */
148 I32 utf8; /* whether the pattern is utf8 or not */
149 I32 orig_utf8; /* whether the pattern was originally in utf8 */
150 /* XXX use this for future optimisation of case
151 * where pattern must be upgraded to utf8. */
152 I32 uni_semantics; /* If a d charset modifier should use unicode
153 rules, even if the pattern is not in
155 HV *paren_names; /* Paren names */
157 regnode **recurse; /* Recurse regops */
158 I32 recurse_count; /* Number of recurse regops */
161 I32 override_recoding;
162 I32 in_multi_char_class;
163 struct reg_code_block *code_blocks; /* positions of literal (?{})
165 int num_code_blocks; /* size of code_blocks[] */
166 int code_index; /* next code_blocks[] slot */
168 char *starttry; /* -Dr: where regtry was called. */
169 #define RExC_starttry (pRExC_state->starttry)
171 SV *runtime_code_qr; /* qr with the runtime code blocks */
173 const char *lastparse;
175 AV *paren_name_list; /* idx -> name */
176 #define RExC_lastparse (pRExC_state->lastparse)
177 #define RExC_lastnum (pRExC_state->lastnum)
178 #define RExC_paren_name_list (pRExC_state->paren_name_list)
182 #define RExC_flags (pRExC_state->flags)
183 #define RExC_pm_flags (pRExC_state->pm_flags)
184 #define RExC_precomp (pRExC_state->precomp)
185 #define RExC_rx_sv (pRExC_state->rx_sv)
186 #define RExC_rx (pRExC_state->rx)
187 #define RExC_rxi (pRExC_state->rxi)
188 #define RExC_start (pRExC_state->start)
189 #define RExC_end (pRExC_state->end)
190 #define RExC_parse (pRExC_state->parse)
191 #define RExC_whilem_seen (pRExC_state->whilem_seen)
192 #ifdef RE_TRACK_PATTERN_OFFSETS
193 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
195 #define RExC_emit (pRExC_state->emit)
196 #define RExC_emit_start (pRExC_state->emit_start)
197 #define RExC_emit_bound (pRExC_state->emit_bound)
198 #define RExC_naughty (pRExC_state->naughty)
199 #define RExC_sawback (pRExC_state->sawback)
200 #define RExC_seen (pRExC_state->seen)
201 #define RExC_size (pRExC_state->size)
202 #define RExC_npar (pRExC_state->npar)
203 #define RExC_nestroot (pRExC_state->nestroot)
204 #define RExC_extralen (pRExC_state->extralen)
205 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
206 #define RExC_utf8 (pRExC_state->utf8)
207 #define RExC_uni_semantics (pRExC_state->uni_semantics)
208 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
209 #define RExC_open_parens (pRExC_state->open_parens)
210 #define RExC_close_parens (pRExC_state->close_parens)
211 #define RExC_opend (pRExC_state->opend)
212 #define RExC_paren_names (pRExC_state->paren_names)
213 #define RExC_recurse (pRExC_state->recurse)
214 #define RExC_recurse_count (pRExC_state->recurse_count)
215 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
216 #define RExC_contains_locale (pRExC_state->contains_locale)
217 #define RExC_override_recoding (pRExC_state->override_recoding)
218 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
221 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
222 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
223 ((*s) == '{' && regcurly(s)))
226 #undef SPSTART /* dratted cpp namespace... */
229 * Flags to be passed up and down.
231 #define WORST 0 /* Worst case. */
232 #define HASWIDTH 0x01 /* Known to match non-null strings. */
234 /* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
235 * character. Note that this is not the same thing as REGNODE_SIMPLE */
237 #define SPSTART 0x04 /* Starts with * or +. */
238 #define TRYAGAIN 0x08 /* Weeded out a declaration. */
239 #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
241 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
243 /* whether trie related optimizations are enabled */
244 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
245 #define TRIE_STUDY_OPT
246 #define FULL_TRIE_STUDY
252 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
253 #define PBITVAL(paren) (1 << ((paren) & 7))
254 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
255 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
256 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
258 /* If not already in utf8, do a longjmp back to the beginning */
259 #define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
260 #define REQUIRE_UTF8 STMT_START { \
261 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
264 /* About scan_data_t.
266 During optimisation we recurse through the regexp program performing
267 various inplace (keyhole style) optimisations. In addition study_chunk
268 and scan_commit populate this data structure with information about
269 what strings MUST appear in the pattern. We look for the longest
270 string that must appear at a fixed location, and we look for the
271 longest string that may appear at a floating location. So for instance
276 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
277 strings (because they follow a .* construct). study_chunk will identify
278 both FOO and BAR as being the longest fixed and floating strings respectively.
280 The strings can be composites, for instance
284 will result in a composite fixed substring 'foo'.
286 For each string some basic information is maintained:
288 - offset or min_offset
289 This is the position the string must appear at, or not before.
290 It also implicitly (when combined with minlenp) tells us how many
291 characters must match before the string we are searching for.
292 Likewise when combined with minlenp and the length of the string it
293 tells us how many characters must appear after the string we have
297 Only used for floating strings. This is the rightmost point that
298 the string can appear at. If set to I32 max it indicates that the
299 string can occur infinitely far to the right.
302 A pointer to the minimum number of characters of the pattern that the
303 string was found inside. This is important as in the case of positive
304 lookahead or positive lookbehind we can have multiple patterns
309 The minimum length of the pattern overall is 3, the minimum length
310 of the lookahead part is 3, but the minimum length of the part that
311 will actually match is 1. So 'FOO's minimum length is 3, but the
312 minimum length for the F is 1. This is important as the minimum length
313 is used to determine offsets in front of and behind the string being
314 looked for. Since strings can be composites this is the length of the
315 pattern at the time it was committed with a scan_commit. Note that
316 the length is calculated by study_chunk, so that the minimum lengths
317 are not known until the full pattern has been compiled, thus the
318 pointer to the value.
322 In the case of lookbehind the string being searched for can be
323 offset past the start point of the final matching string.
324 If this value was just blithely removed from the min_offset it would
325 invalidate some of the calculations for how many chars must match
326 before or after (as they are derived from min_offset and minlen and
327 the length of the string being searched for).
328 When the final pattern is compiled and the data is moved from the
329 scan_data_t structure into the regexp structure the information
330 about lookbehind is factored in, with the information that would
331 have been lost precalculated in the end_shift field for the
334 The fields pos_min and pos_delta are used to store the minimum offset
335 and the delta to the maximum offset at the current point in the pattern.
339 typedef struct scan_data_t {
340 /*I32 len_min; unused */
341 /*I32 len_delta; unused */
345 I32 last_end; /* min value, <0 unless valid. */
348 SV **longest; /* Either &l_fixed, or &l_float. */
349 SV *longest_fixed; /* longest fixed string found in pattern */
350 I32 offset_fixed; /* offset where it starts */
351 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
352 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
353 SV *longest_float; /* longest floating string found in pattern */
354 I32 offset_float_min; /* earliest point in string it can appear */
355 I32 offset_float_max; /* latest point in string it can appear */
356 I32 *minlen_float; /* pointer to the minlen relevant to the string */
357 I32 lookbehind_float; /* is the position of the string modified by LB */
361 struct regnode_charclass_class *start_class;
365 * Forward declarations for pregcomp()'s friends.
368 static const scan_data_t zero_scan_data =
369 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
371 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
372 #define SF_BEFORE_SEOL 0x0001
373 #define SF_BEFORE_MEOL 0x0002
374 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
375 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
378 # define SF_FIX_SHIFT_EOL (0+2)
379 # define SF_FL_SHIFT_EOL (0+4)
381 # define SF_FIX_SHIFT_EOL (+2)
382 # define SF_FL_SHIFT_EOL (+4)
385 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
386 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
388 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
389 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
390 #define SF_IS_INF 0x0040
391 #define SF_HAS_PAR 0x0080
392 #define SF_IN_PAR 0x0100
393 #define SF_HAS_EVAL 0x0200
394 #define SCF_DO_SUBSTR 0x0400
395 #define SCF_DO_STCLASS_AND 0x0800
396 #define SCF_DO_STCLASS_OR 0x1000
397 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
398 #define SCF_WHILEM_VISITED_POS 0x2000
400 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
401 #define SCF_SEEN_ACCEPT 0x8000
403 #define UTF cBOOL(RExC_utf8)
405 /* The enums for all these are ordered so things work out correctly */
406 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
407 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
408 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
409 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
410 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
411 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
412 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
414 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
416 #define OOB_NAMEDCLASS -1
418 /* There is no code point that is out-of-bounds, so this is problematic. But
419 * its only current use is to initialize a variable that is always set before
421 #define OOB_UNICODE 0xDEADBEEF
423 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
424 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
427 /* length of regex to show in messages that don't mark a position within */
428 #define RegexLengthToShowInErrorMessages 127
431 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
432 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
433 * op/pragma/warn/regcomp.
435 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
436 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
438 #define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
441 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
442 * arg. Show regex, up to a maximum length. If it's too long, chop and add
445 #define _FAIL(code) STMT_START { \
446 const char *ellipses = ""; \
447 IV len = RExC_end - RExC_precomp; \
450 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
451 if (len > RegexLengthToShowInErrorMessages) { \
452 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
453 len = RegexLengthToShowInErrorMessages - 10; \
459 #define FAIL(msg) _FAIL( \
460 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
461 msg, (int)len, RExC_precomp, ellipses))
463 #define FAIL2(msg,arg) _FAIL( \
464 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
465 arg, (int)len, RExC_precomp, ellipses))
468 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
470 #define Simple_vFAIL(m) STMT_START { \
471 const IV offset = RExC_parse - RExC_precomp; \
472 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
473 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
477 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
479 #define vFAIL(m) STMT_START { \
481 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
486 * Like Simple_vFAIL(), but accepts two arguments.
488 #define Simple_vFAIL2(m,a1) STMT_START { \
489 const IV offset = RExC_parse - RExC_precomp; \
490 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
491 (int)offset, RExC_precomp, RExC_precomp + offset); \
495 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
497 #define vFAIL2(m,a1) STMT_START { \
499 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
500 Simple_vFAIL2(m, a1); \
505 * Like Simple_vFAIL(), but accepts three arguments.
507 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
508 const IV offset = RExC_parse - RExC_precomp; \
509 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
510 (int)offset, RExC_precomp, RExC_precomp + offset); \
514 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
516 #define vFAIL3(m,a1,a2) STMT_START { \
518 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
519 Simple_vFAIL3(m, a1, a2); \
523 * Like Simple_vFAIL(), but accepts four arguments.
525 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
526 const IV offset = RExC_parse - RExC_precomp; \
527 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
528 (int)offset, RExC_precomp, RExC_precomp + offset); \
531 #define ckWARNreg(loc,m) STMT_START { \
532 const IV offset = loc - RExC_precomp; \
533 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
534 (int)offset, RExC_precomp, RExC_precomp + offset); \
537 #define ckWARNregdep(loc,m) STMT_START { \
538 const IV offset = loc - RExC_precomp; \
539 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
541 (int)offset, RExC_precomp, RExC_precomp + offset); \
544 #define ckWARN2regdep(loc,m, a1) STMT_START { \
545 const IV offset = loc - RExC_precomp; \
546 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
548 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
551 #define ckWARN2reg(loc, m, a1) STMT_START { \
552 const IV offset = loc - RExC_precomp; \
553 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
554 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
557 #define vWARN3(loc, m, a1, a2) STMT_START { \
558 const IV offset = loc - RExC_precomp; \
559 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
560 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
563 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
564 const IV offset = loc - RExC_precomp; \
565 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
566 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
569 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
570 const IV offset = loc - RExC_precomp; \
571 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
572 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
575 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
576 const IV offset = loc - RExC_precomp; \
577 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
578 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
581 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
582 const IV offset = loc - RExC_precomp; \
583 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
584 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
588 /* Allow for side effects in s */
589 #define REGC(c,s) STMT_START { \
590 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
593 /* Macros for recording node offsets. 20001227 mjd@plover.com
594 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
595 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
596 * Element 0 holds the number n.
597 * Position is 1 indexed.
599 #ifndef RE_TRACK_PATTERN_OFFSETS
600 #define Set_Node_Offset_To_R(node,byte)
601 #define Set_Node_Offset(node,byte)
602 #define Set_Cur_Node_Offset
603 #define Set_Node_Length_To_R(node,len)
604 #define Set_Node_Length(node,len)
605 #define Set_Node_Cur_Length(node)
606 #define Node_Offset(n)
607 #define Node_Length(n)
608 #define Set_Node_Offset_Length(node,offset,len)
609 #define ProgLen(ri) ri->u.proglen
610 #define SetProgLen(ri,x) ri->u.proglen = x
612 #define ProgLen(ri) ri->u.offsets[0]
613 #define SetProgLen(ri,x) ri->u.offsets[0] = x
614 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
616 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
617 __LINE__, (int)(node), (int)(byte))); \
619 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
621 RExC_offsets[2*(node)-1] = (byte); \
626 #define Set_Node_Offset(node,byte) \
627 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
628 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
630 #define Set_Node_Length_To_R(node,len) STMT_START { \
632 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
633 __LINE__, (int)(node), (int)(len))); \
635 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
637 RExC_offsets[2*(node)] = (len); \
642 #define Set_Node_Length(node,len) \
643 Set_Node_Length_To_R((node)-RExC_emit_start, len)
644 #define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
645 #define Set_Node_Cur_Length(node) \
646 Set_Node_Length(node, RExC_parse - parse_start)
648 /* Get offsets and lengths */
649 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
650 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
652 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
653 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
654 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
658 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
659 #define EXPERIMENTAL_INPLACESCAN
660 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
662 #define DEBUG_STUDYDATA(str,data,depth) \
663 DEBUG_OPTIMISE_MORE_r(if(data){ \
664 PerlIO_printf(Perl_debug_log, \
665 "%*s" str "Pos:%"IVdf"/%"IVdf \
666 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
667 (int)(depth)*2, "", \
668 (IV)((data)->pos_min), \
669 (IV)((data)->pos_delta), \
670 (UV)((data)->flags), \
671 (IV)((data)->whilem_c), \
672 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
673 is_inf ? "INF " : "" \
675 if ((data)->last_found) \
676 PerlIO_printf(Perl_debug_log, \
677 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
678 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
679 SvPVX_const((data)->last_found), \
680 (IV)((data)->last_end), \
681 (IV)((data)->last_start_min), \
682 (IV)((data)->last_start_max), \
683 ((data)->longest && \
684 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
685 SvPVX_const((data)->longest_fixed), \
686 (IV)((data)->offset_fixed), \
687 ((data)->longest && \
688 (data)->longest==&((data)->longest_float)) ? "*" : "", \
689 SvPVX_const((data)->longest_float), \
690 (IV)((data)->offset_float_min), \
691 (IV)((data)->offset_float_max) \
693 PerlIO_printf(Perl_debug_log,"\n"); \
696 static void clear_re(pTHX_ void *r);
698 /* Mark that we cannot extend a found fixed substring at this point.
699 Update the longest found anchored substring and the longest found
700 floating substrings if needed. */
703 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
705 const STRLEN l = CHR_SVLEN(data->last_found);
706 const STRLEN old_l = CHR_SVLEN(*data->longest);
707 GET_RE_DEBUG_FLAGS_DECL;
709 PERL_ARGS_ASSERT_SCAN_COMMIT;
711 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
712 SvSetMagicSV(*data->longest, data->last_found);
713 if (*data->longest == data->longest_fixed) {
714 data->offset_fixed = l ? data->last_start_min : data->pos_min;
715 if (data->flags & SF_BEFORE_EOL)
717 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
719 data->flags &= ~SF_FIX_BEFORE_EOL;
720 data->minlen_fixed=minlenp;
721 data->lookbehind_fixed=0;
723 else { /* *data->longest == data->longest_float */
724 data->offset_float_min = l ? data->last_start_min : data->pos_min;
725 data->offset_float_max = (l
726 ? data->last_start_max
727 : data->pos_min + data->pos_delta);
728 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
729 data->offset_float_max = I32_MAX;
730 if (data->flags & SF_BEFORE_EOL)
732 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
734 data->flags &= ~SF_FL_BEFORE_EOL;
735 data->minlen_float=minlenp;
736 data->lookbehind_float=0;
739 SvCUR_set(data->last_found, 0);
741 SV * const sv = data->last_found;
742 if (SvUTF8(sv) && SvMAGICAL(sv)) {
743 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
749 data->flags &= ~SF_BEFORE_EOL;
750 DEBUG_STUDYDATA("commit: ",data,0);
753 /* Can match anything (initialization) */
755 S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
757 PERL_ARGS_ASSERT_CL_ANYTHING;
759 ANYOF_BITMAP_SETALL(cl);
760 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
761 |ANYOF_NON_UTF8_LATIN1_ALL;
763 /* If any portion of the regex is to operate under locale rules,
764 * initialization includes it. The reason this isn't done for all regexes
765 * is that the optimizer was written under the assumption that locale was
766 * all-or-nothing. Given the complexity and lack of documentation in the
767 * optimizer, and that there are inadequate test cases for locale, so many
768 * parts of it may not work properly, it is safest to avoid locale unless
770 if (RExC_contains_locale) {
771 ANYOF_CLASS_SETALL(cl); /* /l uses class */
772 cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
775 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
779 /* Can match anything (initialization) */
781 S_cl_is_anything(const struct regnode_charclass_class *cl)
785 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
787 for (value = 0; value <= ANYOF_MAX; value += 2)
788 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
790 if (!(cl->flags & ANYOF_UNICODE_ALL))
792 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
797 /* Can match anything (initialization) */
799 S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
801 PERL_ARGS_ASSERT_CL_INIT;
803 Zero(cl, 1, struct regnode_charclass_class);
805 cl_anything(pRExC_state, cl);
806 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
809 /* These two functions currently do the exact same thing */
810 #define cl_init_zero S_cl_init
812 /* 'AND' a given class with another one. Can create false positives. 'cl'
813 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
814 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
816 S_cl_and(struct regnode_charclass_class *cl,
817 const struct regnode_charclass_class *and_with)
819 PERL_ARGS_ASSERT_CL_AND;
821 assert(and_with->type == ANYOF);
823 /* I (khw) am not sure all these restrictions are necessary XXX */
824 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
825 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
826 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
827 && !(and_with->flags & ANYOF_LOC_FOLD)
828 && !(cl->flags & ANYOF_LOC_FOLD)) {
831 if (and_with->flags & ANYOF_INVERT)
832 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
833 cl->bitmap[i] &= ~and_with->bitmap[i];
835 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
836 cl->bitmap[i] &= and_with->bitmap[i];
837 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
839 if (and_with->flags & ANYOF_INVERT) {
841 /* Here, the and'ed node is inverted. Get the AND of the flags that
842 * aren't affected by the inversion. Those that are affected are
843 * handled individually below */
844 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
845 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
846 cl->flags |= affected_flags;
848 /* We currently don't know how to deal with things that aren't in the
849 * bitmap, but we know that the intersection is no greater than what
850 * is already in cl, so let there be false positives that get sorted
851 * out after the synthetic start class succeeds, and the node is
852 * matched for real. */
854 /* The inversion of these two flags indicate that the resulting
855 * intersection doesn't have them */
856 if (and_with->flags & ANYOF_UNICODE_ALL) {
857 cl->flags &= ~ANYOF_UNICODE_ALL;
859 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
860 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
863 else { /* and'd node is not inverted */
864 U8 outside_bitmap_but_not_utf8; /* Temp variable */
866 if (! ANYOF_NONBITMAP(and_with)) {
868 /* Here 'and_with' doesn't match anything outside the bitmap
869 * (except possibly ANYOF_UNICODE_ALL), which means the
870 * intersection can't either, except for ANYOF_UNICODE_ALL, in
871 * which case we don't know what the intersection is, but it's no
872 * greater than what cl already has, so can just leave it alone,
873 * with possible false positives */
874 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
875 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
876 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
879 else if (! ANYOF_NONBITMAP(cl)) {
881 /* Here, 'and_with' does match something outside the bitmap, and cl
882 * doesn't have a list of things to match outside the bitmap. If
883 * cl can match all code points above 255, the intersection will
884 * be those above-255 code points that 'and_with' matches. If cl
885 * can't match all Unicode code points, it means that it can't
886 * match anything outside the bitmap (since the 'if' that got us
887 * into this block tested for that), so we leave the bitmap empty.
889 if (cl->flags & ANYOF_UNICODE_ALL) {
890 ARG_SET(cl, ARG(and_with));
892 /* and_with's ARG may match things that don't require UTF8.
893 * And now cl's will too, in spite of this being an 'and'. See
894 * the comments below about the kludge */
895 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
899 /* Here, both 'and_with' and cl match something outside the
900 * bitmap. Currently we do not do the intersection, so just match
901 * whatever cl had at the beginning. */
905 /* Take the intersection of the two sets of flags. However, the
906 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
907 * kludge around the fact that this flag is not treated like the others
908 * which are initialized in cl_anything(). The way the optimizer works
909 * is that the synthetic start class (SSC) is initialized to match
910 * anything, and then the first time a real node is encountered, its
911 * values are AND'd with the SSC's with the result being the values of
912 * the real node. However, there are paths through the optimizer where
913 * the AND never gets called, so those initialized bits are set
914 * inappropriately, which is not usually a big deal, as they just cause
915 * false positives in the SSC, which will just mean a probably
916 * imperceptible slow down in execution. However this bit has a
917 * higher false positive consequence in that it can cause utf8.pm,
918 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
919 * bigger slowdown and also causes significant extra memory to be used.
920 * In order to prevent this, the code now takes a different tack. The
921 * bit isn't set unless some part of the regular expression needs it,
922 * but once set it won't get cleared. This means that these extra
923 * modules won't get loaded unless there was some path through the
924 * pattern that would have required them anyway, and so any false
925 * positives that occur by not ANDing them out when they could be
926 * aren't as severe as they would be if we treated this bit like all
928 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
929 & ANYOF_NONBITMAP_NON_UTF8;
930 cl->flags &= and_with->flags;
931 cl->flags |= outside_bitmap_but_not_utf8;
935 /* 'OR' a given class with another one. Can create false positives. 'cl'
936 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
937 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
939 S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
941 PERL_ARGS_ASSERT_CL_OR;
943 if (or_with->flags & ANYOF_INVERT) {
945 /* Here, the or'd node is to be inverted. This means we take the
946 * complement of everything not in the bitmap, but currently we don't
947 * know what that is, so give up and match anything */
948 if (ANYOF_NONBITMAP(or_with)) {
949 cl_anything(pRExC_state, cl);
952 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
953 * <= (B1 | !B2) | (CL1 | !CL2)
954 * which is wasteful if CL2 is small, but we ignore CL2:
955 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
956 * XXXX Can we handle case-fold? Unclear:
957 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
958 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
960 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
961 && !(or_with->flags & ANYOF_LOC_FOLD)
962 && !(cl->flags & ANYOF_LOC_FOLD) ) {
965 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
966 cl->bitmap[i] |= ~or_with->bitmap[i];
967 } /* XXXX: logic is complicated otherwise */
969 cl_anything(pRExC_state, cl);
972 /* And, we can just take the union of the flags that aren't affected
973 * by the inversion */
974 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
976 /* For the remaining flags:
977 ANYOF_UNICODE_ALL and inverted means to not match anything above
978 255, which means that the union with cl should just be
979 what cl has in it, so can ignore this flag
980 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
981 is 127-255 to match them, but then invert that, so the
982 union with cl should just be what cl has in it, so can
985 } else { /* 'or_with' is not inverted */
986 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
987 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
988 && (!(or_with->flags & ANYOF_LOC_FOLD)
989 || (cl->flags & ANYOF_LOC_FOLD)) ) {
992 /* OR char bitmap and class bitmap separately */
993 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
994 cl->bitmap[i] |= or_with->bitmap[i];
995 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
996 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
997 cl->classflags[i] |= or_with->classflags[i];
998 cl->flags |= ANYOF_CLASS;
1001 else { /* XXXX: logic is complicated, leave it along for a moment. */
1002 cl_anything(pRExC_state, cl);
1005 if (ANYOF_NONBITMAP(or_with)) {
1007 /* Use the added node's outside-the-bit-map match if there isn't a
1008 * conflict. If there is a conflict (both nodes match something
1009 * outside the bitmap, but what they match outside is not the same
1010 * pointer, and hence not easily compared until XXX we extend
1011 * inversion lists this far), give up and allow the start class to
1012 * match everything outside the bitmap. If that stuff is all above
1013 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
1014 if (! ANYOF_NONBITMAP(cl)) {
1015 ARG_SET(cl, ARG(or_with));
1017 else if (ARG(cl) != ARG(or_with)) {
1019 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
1020 cl_anything(pRExC_state, cl);
1023 cl->flags |= ANYOF_UNICODE_ALL;
1028 /* Take the union */
1029 cl->flags |= or_with->flags;
1033 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1034 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1035 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1036 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1041 dump_trie(trie,widecharmap,revcharmap)
1042 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1043 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1045 These routines dump out a trie in a somewhat readable format.
1046 The _interim_ variants are used for debugging the interim
1047 tables that are used to generate the final compressed
1048 representation which is what dump_trie expects.
1050 Part of the reason for their existence is to provide a form
1051 of documentation as to how the different representations function.
1056 Dumps the final compressed table form of the trie to Perl_debug_log.
1057 Used for debugging make_trie().
1061 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1062 AV *revcharmap, U32 depth)
1065 SV *sv=sv_newmortal();
1066 int colwidth= widecharmap ? 6 : 4;
1068 GET_RE_DEBUG_FLAGS_DECL;
1070 PERL_ARGS_ASSERT_DUMP_TRIE;
1072 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1073 (int)depth * 2 + 2,"",
1074 "Match","Base","Ofs" );
1076 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1077 SV ** const tmp = av_fetch( revcharmap, state, 0);
1079 PerlIO_printf( Perl_debug_log, "%*s",
1081 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1082 PL_colors[0], PL_colors[1],
1083 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1084 PERL_PV_ESCAPE_FIRSTCHAR
1089 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1090 (int)depth * 2 + 2,"");
1092 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1093 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1094 PerlIO_printf( Perl_debug_log, "\n");
1096 for( state = 1 ; state < trie->statecount ; state++ ) {
1097 const U32 base = trie->states[ state ].trans.base;
1099 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1101 if ( trie->states[ state ].wordnum ) {
1102 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1104 PerlIO_printf( Perl_debug_log, "%6s", "" );
1107 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1112 while( ( base + ofs < trie->uniquecharcount ) ||
1113 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1114 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1117 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1119 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1120 if ( ( base + ofs >= trie->uniquecharcount ) &&
1121 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1122 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1124 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1126 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1128 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1132 PerlIO_printf( Perl_debug_log, "]");
1135 PerlIO_printf( Perl_debug_log, "\n" );
1137 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1138 for (word=1; word <= trie->wordcount; word++) {
1139 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1140 (int)word, (int)(trie->wordinfo[word].prev),
1141 (int)(trie->wordinfo[word].len));
1143 PerlIO_printf(Perl_debug_log, "\n" );
1146 Dumps a fully constructed but uncompressed trie in list form.
1147 List tries normally only are used for construction when the number of
1148 possible chars (trie->uniquecharcount) is very high.
1149 Used for debugging make_trie().
1152 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1153 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1157 SV *sv=sv_newmortal();
1158 int colwidth= widecharmap ? 6 : 4;
1159 GET_RE_DEBUG_FLAGS_DECL;
1161 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1163 /* print out the table precompression. */
1164 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1165 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1166 "------:-----+-----------------\n" );
1168 for( state=1 ; state < next_alloc ; state ++ ) {
1171 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1172 (int)depth * 2 + 2,"", (UV)state );
1173 if ( ! trie->states[ state ].wordnum ) {
1174 PerlIO_printf( Perl_debug_log, "%5s| ","");
1176 PerlIO_printf( Perl_debug_log, "W%4x| ",
1177 trie->states[ state ].wordnum
1180 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1181 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
1183 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1185 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1186 PL_colors[0], PL_colors[1],
1187 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1188 PERL_PV_ESCAPE_FIRSTCHAR
1190 TRIE_LIST_ITEM(state,charid).forid,
1191 (UV)TRIE_LIST_ITEM(state,charid).newstate
1194 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1195 (int)((depth * 2) + 14), "");
1198 PerlIO_printf( Perl_debug_log, "\n");
1203 Dumps a fully constructed but uncompressed trie in table form.
1204 This is the normal DFA style state transition table, with a few
1205 twists to facilitate compression later.
1206 Used for debugging make_trie().
1209 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1210 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1215 SV *sv=sv_newmortal();
1216 int colwidth= widecharmap ? 6 : 4;
1217 GET_RE_DEBUG_FLAGS_DECL;
1219 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1222 print out the table precompression so that we can do a visual check
1223 that they are identical.
1226 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1228 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1229 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1231 PerlIO_printf( Perl_debug_log, "%*s",
1233 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1234 PL_colors[0], PL_colors[1],
1235 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1236 PERL_PV_ESCAPE_FIRSTCHAR
1242 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1244 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1245 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1248 PerlIO_printf( Perl_debug_log, "\n" );
1250 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1252 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1253 (int)depth * 2 + 2,"",
1254 (UV)TRIE_NODENUM( state ) );
1256 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1257 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1259 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1261 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1263 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1264 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1266 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1267 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1275 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1276 startbranch: the first branch in the whole branch sequence
1277 first : start branch of sequence of branch-exact nodes.
1278 May be the same as startbranch
1279 last : Thing following the last branch.
1280 May be the same as tail.
1281 tail : item following the branch sequence
1282 count : words in the sequence
1283 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1284 depth : indent depth
1286 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1288 A trie is an N'ary tree where the branches are determined by digital
1289 decomposition of the key. IE, at the root node you look up the 1st character and
1290 follow that branch repeat until you find the end of the branches. Nodes can be
1291 marked as "accepting" meaning they represent a complete word. Eg:
1295 would convert into the following structure. Numbers represent states, letters
1296 following numbers represent valid transitions on the letter from that state, if
1297 the number is in square brackets it represents an accepting state, otherwise it
1298 will be in parenthesis.
1300 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1304 (1) +-i->(6)-+-s->[7]
1306 +-s->(3)-+-h->(4)-+-e->[5]
1308 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1310 This shows that when matching against the string 'hers' we will begin at state 1
1311 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1312 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1313 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1314 single traverse. We store a mapping from accepting to state to which word was
1315 matched, and then when we have multiple possibilities we try to complete the
1316 rest of the regex in the order in which they occured in the alternation.
1318 The only prior NFA like behaviour that would be changed by the TRIE support is
1319 the silent ignoring of duplicate alternations which are of the form:
1321 / (DUPE|DUPE) X? (?{ ... }) Y /x
1323 Thus EVAL blocks following a trie may be called a different number of times with
1324 and without the optimisation. With the optimisations dupes will be silently
1325 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1326 the following demonstrates:
1328 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1330 which prints out 'word' three times, but
1332 'words'=~/(word|word|word)(?{ print $1 })S/
1334 which doesnt print it out at all. This is due to other optimisations kicking in.
1336 Example of what happens on a structural level:
1338 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1340 1: CURLYM[1] {1,32767}(18)
1351 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1352 and should turn into:
1354 1: CURLYM[1] {1,32767}(18)
1356 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1364 Cases where tail != last would be like /(?foo|bar)baz/:
1374 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1375 and would end up looking like:
1378 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1385 d = uvuni_to_utf8_flags(d, uv, 0);
1387 is the recommended Unicode-aware way of saying
1392 #define TRIE_STORE_REVCHAR(val) \
1395 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1396 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1397 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
1398 SvCUR_set(zlopp, kapow - flrbbbbb); \
1401 av_push(revcharmap, zlopp); \
1403 char ooooff = (char)val; \
1404 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1408 #define TRIE_READ_CHAR STMT_START { \
1411 /* if it is UTF then it is either already folded, or does not need folding */ \
1412 uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
1414 else if (folder == PL_fold_latin1) { \
1415 /* if we use this folder we have to obey unicode rules on latin-1 data */ \
1416 if ( foldlen > 0 ) { \
1417 uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
1423 uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
1424 skiplen = UNISKIP(uvc); \
1425 foldlen -= skiplen; \
1426 scan = foldbuf + skiplen; \
1429 /* raw data, will be folded later if needed */ \
1437 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1438 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1439 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1440 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1442 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1443 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1444 TRIE_LIST_CUR( state )++; \
1447 #define TRIE_LIST_NEW(state) STMT_START { \
1448 Newxz( trie->states[ state ].trans.list, \
1449 4, reg_trie_trans_le ); \
1450 TRIE_LIST_CUR( state ) = 1; \
1451 TRIE_LIST_LEN( state ) = 4; \
1454 #define TRIE_HANDLE_WORD(state) STMT_START { \
1455 U16 dupe= trie->states[ state ].wordnum; \
1456 regnode * const noper_next = regnext( noper ); \
1459 /* store the word for dumping */ \
1461 if (OP(noper) != NOTHING) \
1462 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1464 tmp = newSVpvn_utf8( "", 0, UTF ); \
1465 av_push( trie_words, tmp ); \
1469 trie->wordinfo[curword].prev = 0; \
1470 trie->wordinfo[curword].len = wordlen; \
1471 trie->wordinfo[curword].accept = state; \
1473 if ( noper_next < tail ) { \
1475 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
1476 trie->jump[curword] = (U16)(noper_next - convert); \
1478 jumper = noper_next; \
1480 nextbranch= regnext(cur); \
1484 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1485 /* chain, so that when the bits of chain are later */\
1486 /* linked together, the dups appear in the chain */\
1487 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1488 trie->wordinfo[dupe].prev = curword; \
1490 /* we haven't inserted this word yet. */ \
1491 trie->states[ state ].wordnum = curword; \
1496 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1497 ( ( base + charid >= ucharcount \
1498 && base + charid < ubound \
1499 && state == trie->trans[ base - ucharcount + charid ].check \
1500 && trie->trans[ base - ucharcount + charid ].next ) \
1501 ? trie->trans[ base - ucharcount + charid ].next \
1502 : ( state==1 ? special : 0 ) \
1506 #define MADE_JUMP_TRIE 2
1507 #define MADE_EXACT_TRIE 4
1510 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
1513 /* first pass, loop through and scan words */
1514 reg_trie_data *trie;
1515 HV *widecharmap = NULL;
1516 AV *revcharmap = newAV();
1518 const U32 uniflags = UTF8_ALLOW_DEFAULT;
1523 regnode *jumper = NULL;
1524 regnode *nextbranch = NULL;
1525 regnode *convert = NULL;
1526 U32 *prev_states; /* temp array mapping each state to previous one */
1527 /* we just use folder as a flag in utf8 */
1528 const U8 * folder = NULL;
1531 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1532 AV *trie_words = NULL;
1533 /* along with revcharmap, this only used during construction but both are
1534 * useful during debugging so we store them in the struct when debugging.
1537 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
1538 STRLEN trie_charcount=0;
1540 SV *re_trie_maxbuff;
1541 GET_RE_DEBUG_FLAGS_DECL;
1543 PERL_ARGS_ASSERT_MAKE_TRIE;
1545 PERL_UNUSED_ARG(depth);
1552 case EXACTFU_TRICKYFOLD:
1553 case EXACTFU: folder = PL_fold_latin1; break;
1554 case EXACTF: folder = PL_fold; break;
1555 case EXACTFL: folder = PL_fold_locale; break;
1556 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1559 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1561 trie->startstate = 1;
1562 trie->wordcount = word_count;
1563 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1564 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
1566 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
1567 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1568 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1571 trie_words = newAV();
1574 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1575 if (!SvIOK(re_trie_maxbuff)) {
1576 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1578 DEBUG_TRIE_COMPILE_r({
1579 PerlIO_printf( Perl_debug_log,
1580 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
1581 (int)depth * 2 + 2, "",
1582 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
1583 REG_NODE_NUM(last), REG_NODE_NUM(tail),
1587 /* Find the node we are going to overwrite */
1588 if ( first == startbranch && OP( last ) != BRANCH ) {
1589 /* whole branch chain */
1592 /* branch sub-chain */
1593 convert = NEXTOPER( first );
1596 /* -- First loop and Setup --
1598 We first traverse the branches and scan each word to determine if it
1599 contains widechars, and how many unique chars there are, this is
1600 important as we have to build a table with at least as many columns as we
1603 We use an array of integers to represent the character codes 0..255
1604 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
1605 native representation of the character value as the key and IV's for the
1608 *TODO* If we keep track of how many times each character is used we can
1609 remap the columns so that the table compression later on is more
1610 efficient in terms of memory by ensuring the most common value is in the
1611 middle and the least common are on the outside. IMO this would be better
1612 than a most to least common mapping as theres a decent chance the most
1613 common letter will share a node with the least common, meaning the node
1614 will not be compressible. With a middle is most common approach the worst
1615 case is when we have the least common nodes twice.
1619 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1620 regnode *noper = NEXTOPER( cur );
1621 const U8 *uc = (U8*)STRING( noper );
1622 const U8 *e = uc + STR_LEN( noper );
1624 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1626 const U8 *scan = (U8*)NULL;
1627 U32 wordlen = 0; /* required init */
1629 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
1631 if (OP(noper) == NOTHING) {
1632 regnode *noper_next= regnext(noper);
1633 if (noper_next != tail && OP(noper_next) == flags) {
1635 uc= (U8*)STRING(noper);
1636 e= uc + STR_LEN(noper);
1637 trie->minlen= STR_LEN(noper);
1644 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
1645 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1646 regardless of encoding */
1647 if (OP( noper ) == EXACTFU_SS) {
1648 /* false positives are ok, so just set this */
1649 TRIE_BITMAP_SET(trie,0xDF);
1652 for ( ; uc < e ; uc += len ) {
1653 TRIE_CHARCOUNT(trie)++;
1658 U8 folded= folder[ (U8) uvc ];
1659 if ( !trie->charmap[ folded ] ) {
1660 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1661 TRIE_STORE_REVCHAR( folded );
1664 if ( !trie->charmap[ uvc ] ) {
1665 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1666 TRIE_STORE_REVCHAR( uvc );
1669 /* store the codepoint in the bitmap, and its folded
1671 TRIE_BITMAP_SET(trie, uvc);
1673 /* store the folded codepoint */
1674 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
1677 /* store first byte of utf8 representation of
1678 variant codepoints */
1679 if (! UNI_IS_INVARIANT(uvc)) {
1680 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
1683 set_bit = 0; /* We've done our bit :-) */
1688 widecharmap = newHV();
1690 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
1693 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
1695 if ( !SvTRUE( *svpp ) ) {
1696 sv_setiv( *svpp, ++trie->uniquecharcount );
1697 TRIE_STORE_REVCHAR(uvc);
1701 if( cur == first ) {
1702 trie->minlen = chars;
1703 trie->maxlen = chars;
1704 } else if (chars < trie->minlen) {
1705 trie->minlen = chars;
1706 } else if (chars > trie->maxlen) {
1707 trie->maxlen = chars;
1709 if (OP( noper ) == EXACTFU_SS) {
1710 /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
1711 if (trie->minlen > 1)
1714 if (OP( noper ) == EXACTFU_TRICKYFOLD) {
1715 /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
1716 * - We assume that any such sequence might match a 2 byte string */
1717 if (trie->minlen > 2 )
1721 } /* end first pass */
1722 DEBUG_TRIE_COMPILE_r(
1723 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1724 (int)depth * 2 + 2,"",
1725 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
1726 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1727 (int)trie->minlen, (int)trie->maxlen )
1731 We now know what we are dealing with in terms of unique chars and
1732 string sizes so we can calculate how much memory a naive
1733 representation using a flat table will take. If it's over a reasonable
1734 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
1735 conservative but potentially much slower representation using an array
1738 At the end we convert both representations into the same compressed
1739 form that will be used in regexec.c for matching with. The latter
1740 is a form that cannot be used to construct with but has memory
1741 properties similar to the list form and access properties similar
1742 to the table form making it both suitable for fast searches and
1743 small enough that its feasable to store for the duration of a program.
1745 See the comment in the code where the compressed table is produced
1746 inplace from the flat tabe representation for an explanation of how
1747 the compression works.
1752 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1755 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
1757 Second Pass -- Array Of Lists Representation
1759 Each state will be represented by a list of charid:state records
1760 (reg_trie_trans_le) the first such element holds the CUR and LEN
1761 points of the allocated array. (See defines above).
1763 We build the initial structure using the lists, and then convert
1764 it into the compressed table form which allows faster lookups
1765 (but cant be modified once converted).
1768 STRLEN transcount = 1;
1770 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1771 "%*sCompiling trie using list compiler\n",
1772 (int)depth * 2 + 2, ""));
1774 trie->states = (reg_trie_state *)
1775 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1776 sizeof(reg_trie_state) );
1780 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1782 regnode *noper = NEXTOPER( cur );
1783 U8 *uc = (U8*)STRING( noper );
1784 const U8 *e = uc + STR_LEN( noper );
1785 U32 state = 1; /* required init */
1786 U16 charid = 0; /* sanity init */
1787 U8 *scan = (U8*)NULL; /* sanity init */
1788 STRLEN foldlen = 0; /* required init */
1789 U32 wordlen = 0; /* required init */
1790 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1793 if (OP(noper) == NOTHING) {
1794 regnode *noper_next= regnext(noper);
1795 if (noper_next != tail && OP(noper_next) == flags) {
1797 uc= (U8*)STRING(noper);
1798 e= uc + STR_LEN(noper);
1802 if (OP(noper) != NOTHING) {
1803 for ( ; uc < e ; uc += len ) {
1808 charid = trie->charmap[ uvc ];
1810 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
1814 charid=(U16)SvIV( *svpp );
1817 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1824 if ( !trie->states[ state ].trans.list ) {
1825 TRIE_LIST_NEW( state );
1827 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1828 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1829 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1834 newstate = next_alloc++;
1835 prev_states[newstate] = state;
1836 TRIE_LIST_PUSH( state, charid, newstate );
1841 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1845 TRIE_HANDLE_WORD(state);
1847 } /* end second pass */
1849 /* next alloc is the NEXT state to be allocated */
1850 trie->statecount = next_alloc;
1851 trie->states = (reg_trie_state *)
1852 PerlMemShared_realloc( trie->states,
1854 * sizeof(reg_trie_state) );
1856 /* and now dump it out before we compress it */
1857 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1858 revcharmap, next_alloc,
1862 trie->trans = (reg_trie_trans *)
1863 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
1870 for( state=1 ; state < next_alloc ; state ++ ) {
1874 DEBUG_TRIE_COMPILE_MORE_r(
1875 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1879 if (trie->states[state].trans.list) {
1880 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1884 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1885 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1886 if ( forid < minid ) {
1888 } else if ( forid > maxid ) {
1892 if ( transcount < tp + maxid - minid + 1) {
1894 trie->trans = (reg_trie_trans *)
1895 PerlMemShared_realloc( trie->trans,
1897 * sizeof(reg_trie_trans) );
1898 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1900 base = trie->uniquecharcount + tp - minid;
1901 if ( maxid == minid ) {
1903 for ( ; zp < tp ; zp++ ) {
1904 if ( ! trie->trans[ zp ].next ) {
1905 base = trie->uniquecharcount + zp - minid;
1906 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1907 trie->trans[ zp ].check = state;
1913 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1914 trie->trans[ tp ].check = state;
1919 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
1920 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
1921 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1922 trie->trans[ tid ].check = state;
1924 tp += ( maxid - minid + 1 );
1926 Safefree(trie->states[ state ].trans.list);
1929 DEBUG_TRIE_COMPILE_MORE_r(
1930 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1933 trie->states[ state ].trans.base=base;
1935 trie->lasttrans = tp + 1;
1939 Second Pass -- Flat Table Representation.
1941 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1942 We know that we will need Charcount+1 trans at most to store the data
1943 (one row per char at worst case) So we preallocate both structures
1944 assuming worst case.
1946 We then construct the trie using only the .next slots of the entry
1949 We use the .check field of the first entry of the node temporarily to
1950 make compression both faster and easier by keeping track of how many non
1951 zero fields are in the node.
1953 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1956 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1957 number representing the first entry of the node, and state as a
1958 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1959 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1960 are 2 entrys per node. eg:
1968 The table is internally in the right hand, idx form. However as we also
1969 have to deal with the states array which is indexed by nodenum we have to
1970 use TRIE_NODENUM() to convert.
1973 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1974 "%*sCompiling trie using table compiler\n",
1975 (int)depth * 2 + 2, ""));
1977 trie->trans = (reg_trie_trans *)
1978 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1979 * trie->uniquecharcount + 1,
1980 sizeof(reg_trie_trans) );
1981 trie->states = (reg_trie_state *)
1982 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1983 sizeof(reg_trie_state) );
1984 next_alloc = trie->uniquecharcount + 1;
1987 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1989 regnode *noper = NEXTOPER( cur );
1990 const U8 *uc = (U8*)STRING( noper );
1991 const U8 *e = uc + STR_LEN( noper );
1993 U32 state = 1; /* required init */
1995 U16 charid = 0; /* sanity init */
1996 U32 accept_state = 0; /* sanity init */
1997 U8 *scan = (U8*)NULL; /* sanity init */
1999 STRLEN foldlen = 0; /* required init */
2000 U32 wordlen = 0; /* required init */
2002 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2004 if (OP(noper) == NOTHING) {
2005 regnode *noper_next= regnext(noper);
2006 if (noper_next != tail && OP(noper_next) == flags) {
2008 uc= (U8*)STRING(noper);
2009 e= uc + STR_LEN(noper);
2013 if ( OP(noper) != NOTHING ) {
2014 for ( ; uc < e ; uc += len ) {
2019 charid = trie->charmap[ uvc ];
2021 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
2022 charid = svpp ? (U16)SvIV(*svpp) : 0;
2026 if ( !trie->trans[ state + charid ].next ) {
2027 trie->trans[ state + charid ].next = next_alloc;
2028 trie->trans[ state ].check++;
2029 prev_states[TRIE_NODENUM(next_alloc)]
2030 = TRIE_NODENUM(state);
2031 next_alloc += trie->uniquecharcount;
2033 state = trie->trans[ state + charid ].next;
2035 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2037 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2040 accept_state = TRIE_NODENUM( state );
2041 TRIE_HANDLE_WORD(accept_state);
2043 } /* end second pass */
2045 /* and now dump it out before we compress it */
2046 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2048 next_alloc, depth+1));
2052 * Inplace compress the table.*
2054 For sparse data sets the table constructed by the trie algorithm will
2055 be mostly 0/FAIL transitions or to put it another way mostly empty.
2056 (Note that leaf nodes will not contain any transitions.)
2058 This algorithm compresses the tables by eliminating most such
2059 transitions, at the cost of a modest bit of extra work during lookup:
2061 - Each states[] entry contains a .base field which indicates the
2062 index in the state[] array wheres its transition data is stored.
2064 - If .base is 0 there are no valid transitions from that node.
2066 - If .base is nonzero then charid is added to it to find an entry in
2069 -If trans[states[state].base+charid].check!=state then the
2070 transition is taken to be a 0/Fail transition. Thus if there are fail
2071 transitions at the front of the node then the .base offset will point
2072 somewhere inside the previous nodes data (or maybe even into a node
2073 even earlier), but the .check field determines if the transition is
2077 The following process inplace converts the table to the compressed
2078 table: We first do not compress the root node 1,and mark all its
2079 .check pointers as 1 and set its .base pointer as 1 as well. This
2080 allows us to do a DFA construction from the compressed table later,
2081 and ensures that any .base pointers we calculate later are greater
2084 - We set 'pos' to indicate the first entry of the second node.
2086 - We then iterate over the columns of the node, finding the first and
2087 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2088 and set the .check pointers accordingly, and advance pos
2089 appropriately and repreat for the next node. Note that when we copy
2090 the next pointers we have to convert them from the original
2091 NODEIDX form to NODENUM form as the former is not valid post
2094 - If a node has no transitions used we mark its base as 0 and do not
2095 advance the pos pointer.
2097 - If a node only has one transition we use a second pointer into the
2098 structure to fill in allocated fail transitions from other states.
2099 This pointer is independent of the main pointer and scans forward
2100 looking for null transitions that are allocated to a state. When it
2101 finds one it writes the single transition into the "hole". If the
2102 pointer doesnt find one the single transition is appended as normal.
2104 - Once compressed we can Renew/realloc the structures to release the
2107 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2108 specifically Fig 3.47 and the associated pseudocode.
2112 const U32 laststate = TRIE_NODENUM( next_alloc );
2115 trie->statecount = laststate;
2117 for ( state = 1 ; state < laststate ; state++ ) {
2119 const U32 stateidx = TRIE_NODEIDX( state );
2120 const U32 o_used = trie->trans[ stateidx ].check;
2121 U32 used = trie->trans[ stateidx ].check;
2122 trie->trans[ stateidx ].check = 0;
2124 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2125 if ( flag || trie->trans[ stateidx + charid ].next ) {
2126 if ( trie->trans[ stateidx + charid ].next ) {
2128 for ( ; zp < pos ; zp++ ) {
2129 if ( ! trie->trans[ zp ].next ) {
2133 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2134 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2135 trie->trans[ zp ].check = state;
2136 if ( ++zp > pos ) pos = zp;
2143 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2145 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2146 trie->trans[ pos ].check = state;
2151 trie->lasttrans = pos + 1;
2152 trie->states = (reg_trie_state *)
2153 PerlMemShared_realloc( trie->states, laststate
2154 * sizeof(reg_trie_state) );
2155 DEBUG_TRIE_COMPILE_MORE_r(
2156 PerlIO_printf( Perl_debug_log,
2157 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2158 (int)depth * 2 + 2,"",
2159 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
2162 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2165 } /* end table compress */
2167 DEBUG_TRIE_COMPILE_MORE_r(
2168 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2169 (int)depth * 2 + 2, "",
2170 (UV)trie->statecount,
2171 (UV)trie->lasttrans)
2173 /* resize the trans array to remove unused space */
2174 trie->trans = (reg_trie_trans *)
2175 PerlMemShared_realloc( trie->trans, trie->lasttrans
2176 * sizeof(reg_trie_trans) );
2178 { /* Modify the program and insert the new TRIE node */
2179 U8 nodetype =(U8)(flags & 0xFF);
2183 regnode *optimize = NULL;
2184 #ifdef RE_TRACK_PATTERN_OFFSETS
2187 U32 mjd_nodelen = 0;
2188 #endif /* RE_TRACK_PATTERN_OFFSETS */
2189 #endif /* DEBUGGING */
2191 This means we convert either the first branch or the first Exact,
2192 depending on whether the thing following (in 'last') is a branch
2193 or not and whther first is the startbranch (ie is it a sub part of
2194 the alternation or is it the whole thing.)
2195 Assuming its a sub part we convert the EXACT otherwise we convert
2196 the whole branch sequence, including the first.
2198 /* Find the node we are going to overwrite */
2199 if ( first != startbranch || OP( last ) == BRANCH ) {
2200 /* branch sub-chain */
2201 NEXT_OFF( first ) = (U16)(last - first);
2202 #ifdef RE_TRACK_PATTERN_OFFSETS
2204 mjd_offset= Node_Offset((convert));
2205 mjd_nodelen= Node_Length((convert));
2208 /* whole branch chain */
2210 #ifdef RE_TRACK_PATTERN_OFFSETS
2213 const regnode *nop = NEXTOPER( convert );
2214 mjd_offset= Node_Offset((nop));
2215 mjd_nodelen= Node_Length((nop));
2219 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2220 (int)depth * 2 + 2, "",
2221 (UV)mjd_offset, (UV)mjd_nodelen)
2224 /* But first we check to see if there is a common prefix we can
2225 split out as an EXACT and put in front of the TRIE node. */
2226 trie->startstate= 1;
2227 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2229 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2233 const U32 base = trie->states[ state ].trans.base;
2235 if ( trie->states[state].wordnum )
2238 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2239 if ( ( base + ofs >= trie->uniquecharcount ) &&
2240 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2241 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2243 if ( ++count > 1 ) {
2244 SV **tmp = av_fetch( revcharmap, ofs, 0);
2245 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2246 if ( state == 1 ) break;
2248 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2250 PerlIO_printf(Perl_debug_log,
2251 "%*sNew Start State=%"UVuf" Class: [",
2252 (int)depth * 2 + 2, "",
2255 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2256 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2258 TRIE_BITMAP_SET(trie,*ch);
2260 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2262 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2266 TRIE_BITMAP_SET(trie,*ch);
2268 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2269 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2275 SV **tmp = av_fetch( revcharmap, idx, 0);
2277 char *ch = SvPV( *tmp, len );
2279 SV *sv=sv_newmortal();
2280 PerlIO_printf( Perl_debug_log,
2281 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2282 (int)depth * 2 + 2, "",
2284 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2285 PL_colors[0], PL_colors[1],
2286 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2287 PERL_PV_ESCAPE_FIRSTCHAR
2292 OP( convert ) = nodetype;
2293 str=STRING(convert);
2296 STR_LEN(convert) += len;
2302 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2307 trie->prefixlen = (state-1);
2309 regnode *n = convert+NODE_SZ_STR(convert);
2310 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2311 trie->startstate = state;
2312 trie->minlen -= (state - 1);
2313 trie->maxlen -= (state - 1);
2315 /* At least the UNICOS C compiler choked on this
2316 * being argument to DEBUG_r(), so let's just have
2319 #ifdef PERL_EXT_RE_BUILD
2325 regnode *fix = convert;
2326 U32 word = trie->wordcount;
2328 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2329 while( ++fix < n ) {
2330 Set_Node_Offset_Length(fix, 0, 0);
2333 SV ** const tmp = av_fetch( trie_words, word, 0 );
2335 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2336 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2338 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2346 NEXT_OFF(convert) = (U16)(tail - convert);
2347 DEBUG_r(optimize= n);
2353 if ( trie->maxlen ) {
2354 NEXT_OFF( convert ) = (U16)(tail - convert);
2355 ARG_SET( convert, data_slot );
2356 /* Store the offset to the first unabsorbed branch in
2357 jump[0], which is otherwise unused by the jump logic.
2358 We use this when dumping a trie and during optimisation. */
2360 trie->jump[0] = (U16)(nextbranch - convert);
2362 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2363 * and there is a bitmap
2364 * and the first "jump target" node we found leaves enough room
2365 * then convert the TRIE node into a TRIEC node, with the bitmap
2366 * embedded inline in the opcode - this is hypothetically faster.
2368 if ( !trie->states[trie->startstate].wordnum
2370 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2372 OP( convert ) = TRIEC;
2373 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2374 PerlMemShared_free(trie->bitmap);
2377 OP( convert ) = TRIE;
2379 /* store the type in the flags */
2380 convert->flags = nodetype;
2384 + regarglen[ OP( convert ) ];
2386 /* XXX We really should free up the resource in trie now,
2387 as we won't use them - (which resources?) dmq */
2389 /* needed for dumping*/
2390 DEBUG_r(if (optimize) {
2391 regnode *opt = convert;
2393 while ( ++opt < optimize) {
2394 Set_Node_Offset_Length(opt,0,0);
2397 Try to clean up some of the debris left after the
2400 while( optimize < jumper ) {
2401 mjd_nodelen += Node_Length((optimize));
2402 OP( optimize ) = OPTIMIZED;
2403 Set_Node_Offset_Length(optimize,0,0);
2406 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2408 } /* end node insert */
2410 /* Finish populating the prev field of the wordinfo array. Walk back
2411 * from each accept state until we find another accept state, and if
2412 * so, point the first word's .prev field at the second word. If the
2413 * second already has a .prev field set, stop now. This will be the
2414 * case either if we've already processed that word's accept state,
2415 * or that state had multiple words, and the overspill words were
2416 * already linked up earlier.
2423 for (word=1; word <= trie->wordcount; word++) {
2425 if (trie->wordinfo[word].prev)
2427 state = trie->wordinfo[word].accept;
2429 state = prev_states[state];
2432 prev = trie->states[state].wordnum;
2436 trie->wordinfo[word].prev = prev;
2438 Safefree(prev_states);
2442 /* and now dump out the compressed format */
2443 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2445 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2447 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2448 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2450 SvREFCNT_dec(revcharmap);
2454 : trie->startstate>1
2460 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2462 /* The Trie is constructed and compressed now so we can build a fail array if it's needed
2464 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2465 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2468 We find the fail state for each state in the trie, this state is the longest proper
2469 suffix of the current state's 'word' that is also a proper prefix of another word in our
2470 trie. State 1 represents the word '' and is thus the default fail state. This allows
2471 the DFA not to have to restart after its tried and failed a word at a given point, it
2472 simply continues as though it had been matching the other word in the first place.
2474 'abcdgu'=~/abcdefg|cdgu/
2475 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2476 fail, which would bring us to the state representing 'd' in the second word where we would
2477 try 'g' and succeed, proceeding to match 'cdgu'.
2479 /* add a fail transition */
2480 const U32 trie_offset = ARG(source);
2481 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
2483 const U32 ucharcount = trie->uniquecharcount;
2484 const U32 numstates = trie->statecount;
2485 const U32 ubound = trie->lasttrans + ucharcount;
2489 U32 base = trie->states[ 1 ].trans.base;
2492 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2493 GET_RE_DEBUG_FLAGS_DECL;
2495 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
2497 PERL_UNUSED_ARG(depth);
2501 ARG_SET( stclass, data_slot );
2502 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
2503 RExC_rxi->data->data[ data_slot ] = (void*)aho;
2504 aho->trie=trie_offset;
2505 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2506 Copy( trie->states, aho->states, numstates, reg_trie_state );
2507 Newxz( q, numstates, U32);
2508 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
2511 /* initialize fail[0..1] to be 1 so that we always have
2512 a valid final fail state */
2513 fail[ 0 ] = fail[ 1 ] = 1;
2515 for ( charid = 0; charid < ucharcount ; charid++ ) {
2516 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2518 q[ q_write ] = newstate;
2519 /* set to point at the root */
2520 fail[ q[ q_write++ ] ]=1;
2523 while ( q_read < q_write) {
2524 const U32 cur = q[ q_read++ % numstates ];
2525 base = trie->states[ cur ].trans.base;
2527 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2528 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2530 U32 fail_state = cur;
2533 fail_state = fail[ fail_state ];
2534 fail_base = aho->states[ fail_state ].trans.base;
2535 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2537 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2538 fail[ ch_state ] = fail_state;
2539 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2541 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2543 q[ q_write++ % numstates] = ch_state;
2547 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2548 when we fail in state 1, this allows us to use the
2549 charclass scan to find a valid start char. This is based on the principle
2550 that theres a good chance the string being searched contains lots of stuff
2551 that cant be a start char.
2553 fail[ 0 ] = fail[ 1 ] = 0;
2554 DEBUG_TRIE_COMPILE_r({
2555 PerlIO_printf(Perl_debug_log,
2556 "%*sStclass Failtable (%"UVuf" states): 0",
2557 (int)(depth * 2), "", (UV)numstates
2559 for( q_read=1; q_read<numstates; q_read++ ) {
2560 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2562 PerlIO_printf(Perl_debug_log, "\n");
2565 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
2570 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2571 * These need to be revisited when a newer toolchain becomes available.
2573 #if defined(__sparc64__) && defined(__GNUC__)
2574 # if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2575 # undef SPARC64_GCC_WORKAROUND
2576 # define SPARC64_GCC_WORKAROUND 1
2580 #define DEBUG_PEEP(str,scan,depth) \
2581 DEBUG_OPTIMISE_r({if (scan){ \
2582 SV * const mysv=sv_newmortal(); \
2583 regnode *Next = regnext(scan); \
2584 regprop(RExC_rx, mysv, scan); \
2585 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
2586 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2587 Next ? (REG_NODE_NUM(Next)) : 0 ); \
2591 /* The below joins as many adjacent EXACTish nodes as possible into a single
2592 * one. The regop may be changed if the node(s) contain certain sequences that
2593 * require special handling. The joining is only done if:
2594 * 1) there is room in the current conglomerated node to entirely contain the
2596 * 2) they are the exact same node type
2598 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
2599 * these get optimized out
2601 * If a node is to match under /i (folded), the number of characters it matches
2602 * can be different than its character length if it contains a multi-character
2603 * fold. *min_subtract is set to the total delta of the input nodes.
2605 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2606 * and contains LATIN SMALL LETTER SHARP S
2608 * This is as good a place as any to discuss the design of handling these
2609 * multi-character fold sequences. It's been wrong in Perl for a very long
2610 * time. There are three code points in Unicode whose multi-character folds
2611 * were long ago discovered to mess things up. The previous designs for
2612 * dealing with these involved assigning a special node for them. This
2613 * approach doesn't work, as evidenced by this example:
2614 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
2615 * Both these fold to "sss", but if the pattern is parsed to create a node that
2616 * would match just the \xDF, it won't be able to handle the case where a
2617 * successful match would have to cross the node's boundary. The new approach
2618 * that hopefully generally solves the problem generates an EXACTFU_SS node
2621 * It turns out that there are problems with all multi-character folds, and not
2622 * just these three. Now the code is general, for all such cases, but the
2623 * three still have some special handling. The approach taken is:
2624 * 1) This routine examines each EXACTFish node that could contain multi-
2625 * character fold sequences. It returns in *min_subtract how much to
2626 * subtract from the the actual length of the string to get a real minimum
2627 * match length; it is 0 if there are no multi-char folds. This delta is
2628 * used by the caller to adjust the min length of the match, and the delta
2629 * between min and max, so that the optimizer doesn't reject these
2630 * possibilities based on size constraints.
2631 * 2) Certain of these sequences require special handling by the trie code,
2632 * so, if found, this code changes the joined node type to special ops:
2633 * EXACTFU_TRICKYFOLD and EXACTFU_SS.
2634 * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
2635 * is used for an EXACTFU node that contains at least one "ss" sequence in
2636 * it. For non-UTF-8 patterns and strings, this is the only case where
2637 * there is a possible fold length change. That means that a regular
2638 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2639 * with length changes, and so can be processed faster. regexec.c takes
2640 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2641 * pre-folded by regcomp.c. This saves effort in regex matching.
2642 * However, the pre-folding isn't done for non-UTF8 patterns because the
2643 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2644 * down by forcing the pattern into UTF8 unless necessary. Also what
2645 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2646 * possibilities for the non-UTF8 patterns are quite simple, except for
2647 * the sharp s. All the ones that don't involve a UTF-8 target string are
2648 * members of a fold-pair, and arrays are set up for all of them so that
2649 * the other member of the pair can be found quickly. Code elsewhere in
2650 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2651 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2652 * described in the next item.
2653 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2654 * 'ss' or not is not knowable at compile time. It will match iff the
2655 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2656 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2657 * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
2658 * described in item 3). An assumption that the optimizer part of
2659 * regexec.c (probably unwittingly) makes is that a character in the
2660 * pattern corresponds to at most a single character in the target string.
2661 * (And I do mean character, and not byte here, unlike other parts of the
2662 * documentation that have never been updated to account for multibyte
2663 * Unicode.) This assumption is wrong only in this case, as all other
2664 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2665 * virtue of having this file pre-fold UTF-8 patterns. I'm
2666 * reluctant to try to change this assumption, so instead the code punts.
2667 * This routine examines EXACTF nodes for the sharp s, and returns a
2668 * boolean indicating whether or not the node is an EXACTF node that
2669 * contains a sharp s. When it is true, the caller sets a flag that later
2670 * causes the optimizer in this file to not set values for the floating
2671 * and fixed string lengths, and thus avoids the optimizer code in
2672 * regexec.c that makes the invalid assumption. Thus, there is no
2673 * optimization based on string lengths for EXACTF nodes that contain the
2674 * sharp s. This only happens for /id rules (which means the pattern
2678 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
2679 if (PL_regkind[OP(scan)] == EXACT) \
2680 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
2683 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
2684 /* Merge several consecutive EXACTish nodes into one. */
2685 regnode *n = regnext(scan);
2687 regnode *next = scan + NODE_SZ_STR(scan);
2691 regnode *stop = scan;
2692 GET_RE_DEBUG_FLAGS_DECL;
2694 PERL_UNUSED_ARG(depth);
2697 PERL_ARGS_ASSERT_JOIN_EXACT;
2698 #ifndef EXPERIMENTAL_INPLACESCAN
2699 PERL_UNUSED_ARG(flags);
2700 PERL_UNUSED_ARG(val);
2702 DEBUG_PEEP("join",scan,depth);
2704 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2705 * EXACT ones that are mergeable to the current one. */
2707 && (PL_regkind[OP(n)] == NOTHING
2708 || (stringok && OP(n) == OP(scan)))
2710 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2713 if (OP(n) == TAIL || n > next)
2715 if (PL_regkind[OP(n)] == NOTHING) {
2716 DEBUG_PEEP("skip:",n,depth);
2717 NEXT_OFF(scan) += NEXT_OFF(n);
2718 next = n + NODE_STEP_REGNODE;
2725 else if (stringok) {
2726 const unsigned int oldl = STR_LEN(scan);
2727 regnode * const nnext = regnext(n);
2729 /* XXX I (khw) kind of doubt that this works on platforms where
2730 * U8_MAX is above 255 because of lots of other assumptions */
2731 if (oldl + STR_LEN(n) > U8_MAX)
2734 DEBUG_PEEP("merg",n,depth);
2737 NEXT_OFF(scan) += NEXT_OFF(n);
2738 STR_LEN(scan) += STR_LEN(n);
2739 next = n + NODE_SZ_STR(n);
2740 /* Now we can overwrite *n : */
2741 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2749 #ifdef EXPERIMENTAL_INPLACESCAN
2750 if (flags && !NEXT_OFF(n)) {
2751 DEBUG_PEEP("atch", val, depth);
2752 if (reg_off_by_arg[OP(n)]) {
2753 ARG_SET(n, val - n);
2756 NEXT_OFF(n) = val - n;
2764 *has_exactf_sharp_s = FALSE;
2766 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2767 * can now analyze for sequences of problematic code points. (Prior to
2768 * this final joining, sequences could have been split over boundaries, and
2769 * hence missed). The sequences only happen in folding, hence for any
2770 * non-EXACT EXACTish node */
2771 if (OP(scan) != EXACT) {
2772 const U8 * const s0 = (U8*) STRING(scan);
2774 const U8 * const s_end = s0 + STR_LEN(scan);
2776 /* One pass is made over the node's string looking for all the
2777 * possibilities. to avoid some tests in the loop, there are two main
2778 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2782 /* Examine the string for a multi-character fold sequence. UTF-8
2783 * patterns have all characters pre-folded by the time this code is
2785 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2786 length sequence we are looking for is 2 */
2789 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2790 if (! len) { /* Not a multi-char fold: get next char */
2795 /* Nodes with 'ss' require special handling, except for EXACTFL
2796 * and EXACTFA for which there is no multi-char fold to this */
2797 if (len == 2 && *s == 's' && *(s+1) == 's'
2798 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2801 OP(scan) = EXACTFU_SS;
2804 else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
2805 && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
2806 COMBINING_DIAERESIS_UTF8
2807 COMBINING_ACUTE_ACCENT_UTF8,
2809 || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
2810 COMBINING_DIAERESIS_UTF8
2811 COMBINING_ACUTE_ACCENT_UTF8,
2816 /* These two folds require special handling by trie's, so
2817 * change the node type to indicate this. If EXACTFA and
2818 * EXACTFL were ever to be handled by trie's, this would
2819 * have to be changed. If this node has already been
2820 * changed to EXACTFU_SS in this loop, leave it as is. (I
2821 * (khw) think it doesn't matter in regexec.c for UTF
2822 * patterns, but no need to change it */
2823 if (OP(scan) == EXACTFU) {
2824 OP(scan) = EXACTFU_TRICKYFOLD;
2828 else { /* Here is a generic multi-char fold. */
2829 const U8* multi_end = s + len;
2831 /* Count how many characters in it. In the case of /l and
2832 * /aa, no folds which contain ASCII code points are
2833 * allowed, so check for those, and skip if found. (In
2834 * EXACTFL, no folds are allowed to any Latin1 code point,
2835 * not just ASCII. But there aren't any of these
2836 * currently, nor ever likely, so don't take the time to
2837 * test for them. The code that generates the
2838 * is_MULTI_foo() macros croaks should one actually get put
2839 * into Unicode .) */
2840 if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2841 count = utf8_length(s, multi_end);
2845 while (s < multi_end) {
2848 goto next_iteration;
2858 /* The delta is how long the sequence is minus 1 (1 is how long
2859 * the character that folds to the sequence is) */
2860 *min_subtract += count - 1;
2864 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
2866 /* Here, the pattern is not UTF-8. Look for the multi-char folds
2867 * that are all ASCII. As in the above case, EXACTFL and EXACTFA
2868 * nodes can't have multi-char folds to this range (and there are
2869 * no existing ones in the upper latin1 range). In the EXACTF
2870 * case we look also for the sharp s, which can be in the final
2871 * position. Otherwise we can stop looking 1 byte earlier because
2872 * have to find at least two characters for a multi-fold */
2873 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2875 /* The below is perhaps overboard, but this allows us to save a
2876 * test each time through the loop at the expense of a mask. This
2877 * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
2878 * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
2879 * are 64. This uses an exclusive 'or' to find that bit and then
2880 * inverts it to form a mask, with just a single 0, in the bit
2881 * position where 'S' and 's' differ. */
2882 const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
2883 const U8 s_masked = 's' & S_or_s_mask;
2886 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
2887 if (! len) { /* Not a multi-char fold. */
2888 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2890 *has_exactf_sharp_s = TRUE;
2897 && ((*s & S_or_s_mask) == s_masked)
2898 && ((*(s+1) & S_or_s_mask) == s_masked))
2901 /* EXACTF nodes need to know that the minimum length
2902 * changed so that a sharp s in the string can match this
2903 * ss in the pattern, but they remain EXACTF nodes, as they
2904 * won't match this unless the target string is is UTF-8,
2905 * which we don't know until runtime */
2906 if (OP(scan) != EXACTF) {
2907 OP(scan) = EXACTFU_SS;
2911 *min_subtract += len - 1;
2918 /* Allow dumping but overwriting the collection of skipped
2919 * ops and/or strings with fake optimized ops */
2920 n = scan + NODE_SZ_STR(scan);
2928 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2932 /* REx optimizer. Converts nodes into quicker variants "in place".
2933 Finds fixed substrings. */
2935 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
2936 to the position after last scanned or to NULL. */
2938 #define INIT_AND_WITHP \
2939 assert(!and_withp); \
2940 Newx(and_withp,1,struct regnode_charclass_class); \
2941 SAVEFREEPV(and_withp)
2943 /* this is a chain of data about sub patterns we are processing that
2944 need to be handled separately/specially in study_chunk. Its so
2945 we can simulate recursion without losing state. */
2947 typedef struct scan_frame {
2948 regnode *last; /* last node to process in this frame */
2949 regnode *next; /* next node to process when last is reached */
2950 struct scan_frame *prev; /*previous frame*/
2951 I32 stop; /* what stopparen do we use */
2955 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2957 #define CASE_SYNST_FNC(nAmE) \
2959 if (flags & SCF_DO_STCLASS_AND) { \
2960 for (value = 0; value < 256; value++) \
2961 if (!is_ ## nAmE ## _cp(value)) \
2962 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2965 for (value = 0; value < 256; value++) \
2966 if (is_ ## nAmE ## _cp(value)) \
2967 ANYOF_BITMAP_SET(data->start_class, value); \
2971 if (flags & SCF_DO_STCLASS_AND) { \
2972 for (value = 0; value < 256; value++) \
2973 if (is_ ## nAmE ## _cp(value)) \
2974 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2977 for (value = 0; value < 256; value++) \
2978 if (!is_ ## nAmE ## _cp(value)) \
2979 ANYOF_BITMAP_SET(data->start_class, value); \
2986 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
2987 I32 *minlenp, I32 *deltap,
2992 struct regnode_charclass_class *and_withp,
2993 U32 flags, U32 depth)
2994 /* scanp: Start here (read-write). */
2995 /* deltap: Write maxlen-minlen here. */
2996 /* last: Stop before this one. */
2997 /* data: string data about the pattern */
2998 /* stopparen: treat close N as END */
2999 /* recursed: which subroutines have we recursed into */
3000 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3003 I32 min = 0; /* There must be at least this number of characters to match */
3005 regnode *scan = *scanp, *next;
3007 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3008 int is_inf_internal = 0; /* The studied chunk is infinite */
3009 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3010 scan_data_t data_fake;
3011 SV *re_trie_maxbuff = NULL;
3012 regnode *first_non_open = scan;
3013 I32 stopmin = I32_MAX;
3014 scan_frame *frame = NULL;
3015 GET_RE_DEBUG_FLAGS_DECL;
3017 PERL_ARGS_ASSERT_STUDY_CHUNK;
3020 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3024 while (first_non_open && OP(first_non_open) == OPEN)
3025 first_non_open=regnext(first_non_open);
3030 while ( scan && OP(scan) != END && scan < last ){
3031 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3032 node length to get a real minimum (because
3033 the folded version may be shorter) */
3034 bool has_exactf_sharp_s = FALSE;
3035 /* Peephole optimizer: */
3036 DEBUG_STUDYDATA("Peep:", data,depth);
3037 DEBUG_PEEP("Peep",scan,depth);
3039 /* Its not clear to khw or hv why this is done here, and not in the
3040 * clauses that deal with EXACT nodes. khw's guess is that it's
3041 * because of a previous design */
3042 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
3044 /* Follow the next-chain of the current node and optimize
3045 away all the NOTHINGs from it. */
3046 if (OP(scan) != CURLYX) {
3047 const int max = (reg_off_by_arg[OP(scan)]
3049 /* I32 may be smaller than U16 on CRAYs! */
3050 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3051 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3055 /* Skip NOTHING and LONGJMP. */
3056 while ((n = regnext(n))
3057 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3058 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3059 && off + noff < max)
3061 if (reg_off_by_arg[OP(scan)])
3064 NEXT_OFF(scan) = off;
3069 /* The principal pseudo-switch. Cannot be a switch, since we
3070 look into several different things. */
3071 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3072 || OP(scan) == IFTHEN) {
3073 next = regnext(scan);
3075 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
3077 if (OP(next) == code || code == IFTHEN) {
3078 /* NOTE - There is similar code to this block below for handling
3079 TRIE nodes on a re-study. If you change stuff here check there
3081 I32 max1 = 0, min1 = I32_MAX, num = 0;
3082 struct regnode_charclass_class accum;
3083 regnode * const startbranch=scan;
3085 if (flags & SCF_DO_SUBSTR)
3086 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
3087 if (flags & SCF_DO_STCLASS)
3088 cl_init_zero(pRExC_state, &accum);
3090 while (OP(scan) == code) {
3091 I32 deltanext, minnext, f = 0, fake;
3092 struct regnode_charclass_class this_class;
3095 data_fake.flags = 0;
3097 data_fake.whilem_c = data->whilem_c;
3098 data_fake.last_closep = data->last_closep;
3101 data_fake.last_closep = &fake;
3103 data_fake.pos_delta = delta;
3104 next = regnext(scan);
3105 scan = NEXTOPER(scan);
3107 scan = NEXTOPER(scan);
3108 if (flags & SCF_DO_STCLASS) {
3109 cl_init(pRExC_state, &this_class);
3110 data_fake.start_class = &this_class;
3111 f = SCF_DO_STCLASS_AND;
3113 if (flags & SCF_WHILEM_VISITED_POS)
3114 f |= SCF_WHILEM_VISITED_POS;
3116 /* we suppose the run is continuous, last=next...*/
3117 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3119 stopparen, recursed, NULL, f,depth+1);
3122 if (max1 < minnext + deltanext)
3123 max1 = minnext + deltanext;
3124 if (deltanext == I32_MAX)
3125 is_inf = is_inf_internal = 1;
3127 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3129 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3130 if ( stopmin > minnext)
3131 stopmin = min + min1;
3132 flags &= ~SCF_DO_SUBSTR;
3134 data->flags |= SCF_SEEN_ACCEPT;
3137 if (data_fake.flags & SF_HAS_EVAL)
3138 data->flags |= SF_HAS_EVAL;
3139 data->whilem_c = data_fake.whilem_c;
3141 if (flags & SCF_DO_STCLASS)
3142 cl_or(pRExC_state, &accum, &this_class);
3144 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3146 if (flags & SCF_DO_SUBSTR) {
3147 data->pos_min += min1;
3148 data->pos_delta += max1 - min1;
3149 if (max1 != min1 || is_inf)
3150 data->longest = &(data->longest_float);
3153 delta += max1 - min1;
3154 if (flags & SCF_DO_STCLASS_OR) {
3155 cl_or(pRExC_state, data->start_class, &accum);
3157 cl_and(data->start_class, and_withp);
3158 flags &= ~SCF_DO_STCLASS;
3161 else if (flags & SCF_DO_STCLASS_AND) {
3163 cl_and(data->start_class, &accum);
3164 flags &= ~SCF_DO_STCLASS;
3167 /* Switch to OR mode: cache the old value of
3168 * data->start_class */
3170 StructCopy(data->start_class, and_withp,
3171 struct regnode_charclass_class);
3172 flags &= ~SCF_DO_STCLASS_AND;
3173 StructCopy(&accum, data->start_class,
3174 struct regnode_charclass_class);
3175 flags |= SCF_DO_STCLASS_OR;
3176 data->start_class->flags |= ANYOF_EOS;
3180 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3183 Assuming this was/is a branch we are dealing with: 'scan' now
3184 points at the item that follows the branch sequence, whatever
3185 it is. We now start at the beginning of the sequence and look
3192 which would be constructed from a pattern like /A|LIST|OF|WORDS/
3194 If we can find such a subsequence we need to turn the first
3195 element into a trie and then add the subsequent branch exact
3196 strings to the trie.
3200 1. patterns where the whole set of branches can be converted.
3202 2. patterns where only a subset can be converted.
3204 In case 1 we can replace the whole set with a single regop
3205 for the trie. In case 2 we need to keep the start and end
3208 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3209 becomes BRANCH TRIE; BRANCH X;
3211 There is an additional case, that being where there is a
3212 common prefix, which gets split out into an EXACT like node
3213 preceding the TRIE node.
3215 If x(1..n)==tail then we can do a simple trie, if not we make
3216 a "jump" trie, such that when we match the appropriate word
3217 we "jump" to the appropriate tail node. Essentially we turn
3218 a nested if into a case structure of sorts.
3223 if (!re_trie_maxbuff) {
3224 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3225 if (!SvIOK(re_trie_maxbuff))
3226 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3228 if ( SvIV(re_trie_maxbuff)>=0 ) {
3230 regnode *first = (regnode *)NULL;
3231 regnode *last = (regnode *)NULL;
3232 regnode *tail = scan;
3237 SV * const mysv = sv_newmortal(); /* for dumping */
3239 /* var tail is used because there may be a TAIL
3240 regop in the way. Ie, the exacts will point to the
3241 thing following the TAIL, but the last branch will
3242 point at the TAIL. So we advance tail. If we
3243 have nested (?:) we may have to move through several
3247 while ( OP( tail ) == TAIL ) {
3248 /* this is the TAIL generated by (?:) */
3249 tail = regnext( tail );
3253 DEBUG_TRIE_COMPILE_r({
3254 regprop(RExC_rx, mysv, tail );
3255 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3256 (int)depth * 2 + 2, "",
3257 "Looking for TRIE'able sequences. Tail node is: ",
3258 SvPV_nolen_const( mysv )
3264 Step through the branches
3265 cur represents each branch,
3266 noper is the first thing to be matched as part of that branch
3267 noper_next is the regnext() of that node.
3269 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3270 via a "jump trie" but we also support building with NOJUMPTRIE,
3271 which restricts the trie logic to structures like /FOO|BAR/.
3273 If noper is a trieable nodetype then the branch is a possible optimization
3274 target. If we are building under NOJUMPTRIE then we require that noper_next
3275 is the same as scan (our current position in the regex program).
3277 Once we have two or more consecutive such branches we can create a
3278 trie of the EXACT's contents and stitch it in place into the program.
3280 If the sequence represents all of the branches in the alternation we
3281 replace the entire thing with a single TRIE node.
3283 Otherwise when it is a subsequence we need to stitch it in place and
3284 replace only the relevant branches. This means the first branch has
3285 to remain as it is used by the alternation logic, and its next pointer,
3286 and needs to be repointed at the item on the branch chain following
3287 the last branch we have optimized away.
3289 This could be either a BRANCH, in which case the subsequence is internal,
3290 or it could be the item following the branch sequence in which case the
3291 subsequence is at the end (which does not necessarily mean the first node
3292 is the start of the alternation).
3294 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3297 ----------------+-----------
3301 EXACTFU_SS | EXACTFU
3302 EXACTFU_TRICKYFOLD | EXACTFU
3307 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3308 ( EXACT == (X) ) ? EXACT : \
3309 ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
3312 /* dont use tail as the end marker for this traverse */
3313 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3314 regnode * const noper = NEXTOPER( cur );
3315 U8 noper_type = OP( noper );
3316 U8 noper_trietype = TRIE_TYPE( noper_type );
3317 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3318 regnode * const noper_next = regnext( noper );
3319 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3320 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3323 DEBUG_TRIE_COMPILE_r({
3324 regprop(RExC_rx, mysv, cur);
3325 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3326 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3328 regprop(RExC_rx, mysv, noper);
3329 PerlIO_printf( Perl_debug_log, " -> %s",
3330 SvPV_nolen_const(mysv));
3333 regprop(RExC_rx, mysv, noper_next );
3334 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3335 SvPV_nolen_const(mysv));
3337 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3338 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3339 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3343 /* Is noper a trieable nodetype that can be merged with the
3344 * current trie (if there is one)? */
3348 ( noper_trietype == NOTHING)
3349 || ( trietype == NOTHING )
3350 || ( trietype == noper_trietype )
3353 && noper_next == tail
3357 /* Handle mergable triable node
3358 * Either we are the first node in a new trieable sequence,
3359 * in which case we do some bookkeeping, otherwise we update
3360 * the end pointer. */
3363 if ( noper_trietype == NOTHING ) {
3364 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3365 regnode * const noper_next = regnext( noper );
3366 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3367 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3370 if ( noper_next_trietype ) {
3371 trietype = noper_next_trietype;
3372 } else if (noper_next_type) {
3373 /* a NOTHING regop is 1 regop wide. We need at least two
3374 * for a trie so we can't merge this in */
3378 trietype = noper_trietype;
3381 if ( trietype == NOTHING )
3382 trietype = noper_trietype;
3387 } /* end handle mergable triable node */
3389 /* handle unmergable node -
3390 * noper may either be a triable node which can not be tried
3391 * together with the current trie, or a non triable node */
3393 /* If last is set and trietype is not NOTHING then we have found
3394 * at least two triable branch sequences in a row of a similar
3395 * trietype so we can turn them into a trie. If/when we
3396 * allow NOTHING to start a trie sequence this condition will be
3397 * required, and it isn't expensive so we leave it in for now. */
3398 if ( trietype && trietype != NOTHING )
3399 make_trie( pRExC_state,
3400 startbranch, first, cur, tail, count,
3401 trietype, depth+1 );
3402 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
3406 && noper_next == tail
3409 /* noper is triable, so we can start a new trie sequence */
3412 trietype = noper_trietype;
3414 /* if we already saw a first but the current node is not triable then we have
3415 * to reset the first information. */
3420 } /* end handle unmergable node */
3421 } /* loop over branches */
3422 DEBUG_TRIE_COMPILE_r({
3423 regprop(RExC_rx, mysv, cur);
3424 PerlIO_printf( Perl_debug_log,
3425 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3426 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3429 if ( last && trietype ) {
3430 if ( trietype != NOTHING ) {
3431 /* the last branch of the sequence was part of a trie,
3432 * so we have to construct it here outside of the loop
3434 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
3435 #ifdef TRIE_STUDY_OPT
3436 if ( ((made == MADE_EXACT_TRIE &&
3437 startbranch == first)
3438 || ( first_non_open == first )) &&
3440 flags |= SCF_TRIE_RESTUDY;
3441 if ( startbranch == first
3444 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3449 /* at this point we know whatever we have is a NOTHING sequence/branch
3450 * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
3452 if ( startbranch == first ) {
3454 /* the entire thing is a NOTHING sequence, something like this:
3455 * (?:|) So we can turn it into a plain NOTHING op. */
3456 DEBUG_TRIE_COMPILE_r({
3457 regprop(RExC_rx, mysv, cur);
3458 PerlIO_printf( Perl_debug_log,
3459 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
3460 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3463 OP(startbranch)= NOTHING;
3464 NEXT_OFF(startbranch)= tail - startbranch;
3465 for ( opt= startbranch + 1; opt < tail ; opt++ )
3469 } /* end if ( last) */
3470 } /* TRIE_MAXBUF is non zero */
3475 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3476 scan = NEXTOPER(NEXTOPER(scan));
3477 } else /* single branch is optimized. */
3478 scan = NEXTOPER(scan);
3480 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3481 scan_frame *newframe = NULL;
3486 if (OP(scan) != SUSPEND) {
3487 /* set the pointer */
3488 if (OP(scan) == GOSUB) {
3490 RExC_recurse[ARG2L(scan)] = scan;
3491 start = RExC_open_parens[paren-1];
3492 end = RExC_close_parens[paren-1];
3495 start = RExC_rxi->program + 1;
3499 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3500 SAVEFREEPV(recursed);
3502 if (!PAREN_TEST(recursed,paren+1)) {
3503 PAREN_SET(recursed,paren+1);
3504 Newx(newframe,1,scan_frame);
3506 if (flags & SCF_DO_SUBSTR) {
3507 SCAN_COMMIT(pRExC_state,data,minlenp);
3508 data->longest = &(data->longest_float);
3510 is_inf = is_inf_internal = 1;
3511 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3512 cl_anything(pRExC_state, data->start_class);
3513 flags &= ~SCF_DO_STCLASS;
3516 Newx(newframe,1,scan_frame);
3519 end = regnext(scan);
3524 SAVEFREEPV(newframe);
3525 newframe->next = regnext(scan);
3526 newframe->last = last;
3527 newframe->stop = stopparen;
3528 newframe->prev = frame;
3538 else if (OP(scan) == EXACT) {
3539 I32 l = STR_LEN(scan);
3542 const U8 * const s = (U8*)STRING(scan);
3543 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3544 l = utf8_length(s, s + l);
3546 uc = *((U8*)STRING(scan));
3549 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3550 /* The code below prefers earlier match for fixed
3551 offset, later match for variable offset. */
3552 if (data->last_end == -1) { /* Update the start info. */
3553 data->last_start_min = data->pos_min;
3554 data->last_start_max = is_inf
3555 ? I32_MAX : data->pos_min + data->pos_delta;
3557 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3559 SvUTF8_on(data->last_found);
3561 SV * const sv = data->last_found;
3562 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3563 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3564 if (mg && mg->mg_len >= 0)
3565 mg->mg_len += utf8_length((U8*)STRING(scan),
3566 (U8*)STRING(scan)+STR_LEN(scan));
3568 data->last_end = data->pos_min + l;
3569 data->pos_min += l; /* As in the first entry. */
3570 data->flags &= ~SF_BEFORE_EOL;
3572 if (flags & SCF_DO_STCLASS_AND) {
3573 /* Check whether it is compatible with what we know already! */
3577 /* If compatible, we or it in below. It is compatible if is
3578 * in the bitmp and either 1) its bit or its fold is set, or 2)
3579 * it's for a locale. Even if there isn't unicode semantics
3580 * here, at runtime there may be because of matching against a
3581 * utf8 string, so accept a possible false positive for
3582 * latin1-range folds */
3584 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3585 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3586 && (!(data->start_class->flags & ANYOF_LOC_FOLD)
3587 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3592 ANYOF_CLASS_ZERO(data->start_class);
3593 ANYOF_BITMAP_ZERO(data->start_class);
3595 ANYOF_BITMAP_SET(data->start_class, uc);
3596 else if (uc >= 0x100) {
3599 /* Some Unicode code points fold to the Latin1 range; as
3600 * XXX temporary code, instead of figuring out if this is
3601 * one, just assume it is and set all the start class bits
3602 * that could be some such above 255 code point's fold
3603 * which will generate fals positives. As the code
3604 * elsewhere that does compute the fold settles down, it
3605 * can be extracted out and re-used here */
3606 for (i = 0; i < 256; i++){
3607 if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
3608 ANYOF_BITMAP_SET(data->start_class, i);
3612 data->start_class->flags &= ~ANYOF_EOS;
3614 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3616 else if (flags & SCF_DO_STCLASS_OR) {
3617 /* false positive possible if the class is case-folded */
3619 ANYOF_BITMAP_SET(data->start_class, uc);
3621 data->start_class->flags |= ANYOF_UNICODE_ALL;
3622 data->start_class->flags &= ~ANYOF_EOS;
3623 cl_and(data->start_class, and_withp);
3625 flags &= ~SCF_DO_STCLASS;
3627 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3628 I32 l = STR_LEN(scan);
3629 UV uc = *((U8*)STRING(scan));
3631 /* Search for fixed substrings supports EXACT only. */
3632 if (flags & SCF_DO_SUBSTR) {
3634 SCAN_COMMIT(pRExC_state, data, minlenp);
3637 const U8 * const s = (U8 *)STRING(scan);
3638 uc = utf8_to_uvchr_buf(s, s + l, NULL);
3639 l = utf8_length(s, s + l);
3641 if (has_exactf_sharp_s) {
3642 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
3644 min += l - min_subtract;
3646 delta += min_subtract;
3647 if (flags & SCF_DO_SUBSTR) {
3648 data->pos_min += l - min_subtract;
3649 if (data->pos_min < 0) {
3652 data->pos_delta += min_subtract;
3654 data->longest = &(data->longest_float);
3657 if (flags & SCF_DO_STCLASS_AND) {
3658 /* Check whether it is compatible with what we know already! */
3661 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3662 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3663 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3667 ANYOF_CLASS_ZERO(data->start_class);
3668 ANYOF_BITMAP_ZERO(data->start_class);
3670 ANYOF_BITMAP_SET(data->start_class, uc);
3671 data->start_class->flags &= ~ANYOF_EOS;
3672 if (OP(scan) == EXACTFL) {
3673 /* XXX This set is probably no longer necessary, and
3674 * probably wrong as LOCALE now is on in the initial
3676 data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
3680 /* Also set the other member of the fold pair. In case
3681 * that unicode semantics is called for at runtime, use
3682 * the full latin1 fold. (Can't do this for locale,
3683 * because not known until runtime) */
3684 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
3686 /* All other (EXACTFL handled above) folds except under
3687 * /iaa that include s, S, and sharp_s also may include
3689 if (OP(scan) != EXACTFA) {
3690 if (uc == 's' || uc == 'S') {
3691 ANYOF_BITMAP_SET(data->start_class,
3692 LATIN_SMALL_LETTER_SHARP_S);
3694 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3695 ANYOF_BITMAP_SET(data->start_class, 's');
3696 ANYOF_BITMAP_SET(data->start_class, 'S');
3701 else if (uc >= 0x100) {
3703 for (i = 0; i < 256; i++){
3704 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3705 ANYOF_BITMAP_SET(data->start_class, i);
3710 else if (flags & SCF_DO_STCLASS_OR) {
3711 if (data->start_class->flags & ANYOF_LOC_FOLD) {
3712 /* false positive possible if the class is case-folded.
3713 Assume that the locale settings are the same... */
3715 ANYOF_BITMAP_SET(data->start_class, uc);
3716 if (OP(scan) != EXACTFL) {
3718 /* And set the other member of the fold pair, but
3719 * can't do that in locale because not known until
3721 ANYOF_BITMAP_SET(data->start_class,
3722 PL_fold_latin1[uc]);
3724 /* All folds except under /iaa that include s, S,
3725 * and sharp_s also may include the others */
3726 if (OP(scan) != EXACTFA) {
3727 if (uc == 's' || uc == 'S') {
3728 ANYOF_BITMAP_SET(data->start_class,
3729 LATIN_SMALL_LETTER_SHARP_S);
3731 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3732 ANYOF_BITMAP_SET(data->start_class, 's');
3733 ANYOF_BITMAP_SET(data->start_class, 'S');
3738 data->start_class->flags &= ~ANYOF_EOS;
3740 cl_and(data->start_class, and_withp);
3742 flags &= ~SCF_DO_STCLASS;
3744 else if (REGNODE_VARIES(OP(scan))) {
3745 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3746 I32 f = flags, pos_before = 0;
3747 regnode * const oscan = scan;
3748 struct regnode_charclass_class this_class;
3749 struct regnode_charclass_class *oclass = NULL;
3750 I32 next_is_eval = 0;
3752 switch (PL_regkind[OP(scan)]) {
3753 case WHILEM: /* End of (?:...)* . */
3754 scan = NEXTOPER(scan);
3757 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3758 next = NEXTOPER(scan);
3759 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3761 maxcount = REG_INFTY;
3762 next = regnext(scan);
3763 scan = NEXTOPER(scan);
3767 if (flags & SCF_DO_SUBSTR)
3772 if (flags & SCF_DO_STCLASS) {
3774 maxcount = REG_INFTY;
3775 next = regnext(scan);
3776 scan = NEXTOPER(scan);
3779 is_inf = is_inf_internal = 1;
3780 scan = regnext(scan);
3781 if (flags & SCF_DO_SUBSTR) {
3782 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
3783 data->longest = &(data->longest_float);
3785 goto optimize_curly_tail;
3787 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3788 && (scan->flags == stopparen))
3793 mincount = ARG1(scan);
3794 maxcount = ARG2(scan);
3796 next = regnext(scan);
3797 if (OP(scan) == CURLYX) {
3798 I32 lp = (data ? *(data->last_closep) : 0);
3799 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
3801 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3802 next_is_eval = (OP(scan) == EVAL);
3804 if (flags & SCF_DO_SUBSTR) {
3805 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
3806 pos_before = data->pos_min;
3810 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3812 data->flags |= SF_IS_INF;
3814 if (flags & SCF_DO_STCLASS) {
3815 cl_init(pRExC_state, &this_class);
3816 oclass = data->start_class;
3817 data->start_class = &this_class;
3818 f |= SCF_DO_STCLASS_AND;
3819 f &= ~SCF_DO_STCLASS_OR;
3821 /* Exclude from super-linear cache processing any {n,m}
3822 regops for which the combination of input pos and regex
3823 pos is not enough information to determine if a match
3826 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3827 regex pos at the \s*, the prospects for a match depend not
3828 only on the input position but also on how many (bar\s*)
3829 repeats into the {4,8} we are. */
3830 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
3831 f &= ~SCF_WHILEM_VISITED_POS;
3833 /* This will finish on WHILEM, setting scan, or on NULL: */
3834 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3835 last, data, stopparen, recursed, NULL,
3837 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
3839 if (flags & SCF_DO_STCLASS)
3840 data->start_class = oclass;
3841 if (mincount == 0 || minnext == 0) {
3842 if (flags & SCF_DO_STCLASS_OR) {
3843 cl_or(pRExC_state, data->start_class, &this_class);
3845 else if (flags & SCF_DO_STCLASS_AND) {
3846 /* Switch to OR mode: cache the old value of
3847 * data->start_class */
3849 StructCopy(data->start_class, and_withp,
3850 struct regnode_charclass_class);
3851 flags &= ~SCF_DO_STCLASS_AND;
3852 StructCopy(&this_class, data->start_class,
3853 struct regnode_charclass_class);
3854 flags |= SCF_DO_STCLASS_OR;
3855 data->start_class->flags |= ANYOF_EOS;
3857 } else { /* Non-zero len */
3858 if (flags & SCF_DO_STCLASS_OR) {
3859 cl_or(pRExC_state, data->start_class, &this_class);
3860 cl_and(data->start_class, and_withp);
3862 else if (flags & SCF_DO_STCLASS_AND)
3863 cl_and(data->start_class, &this_class);
3864 flags &= ~SCF_DO_STCLASS;
3866 if (!scan) /* It was not CURLYX, but CURLY. */
3868 if ( /* ? quantifier ok, except for (?{ ... }) */
3869 (next_is_eval || !(mincount == 0 && maxcount == 1))
3870 && (minnext == 0) && (deltanext == 0)
3871 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3872 && maxcount <= REG_INFTY/3) /* Complement check for big count */
3874 ckWARNreg(RExC_parse,
3875 "Quantifier unexpected on zero-length expression");
3878 min += minnext * mincount;
3879 is_inf_internal |= ((maxcount == REG_INFTY
3880 && (minnext + deltanext) > 0)
3881 || deltanext == I32_MAX);
3882 is_inf |= is_inf_internal;
3883 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3885 /* Try powerful optimization CURLYX => CURLYN. */
3886 if ( OP(oscan) == CURLYX && data
3887 && data->flags & SF_IN_PAR
3888 && !(data->flags & SF_HAS_EVAL)
3889 && !deltanext && minnext == 1 ) {
3890 /* Try to optimize to CURLYN. */
3891 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3892 regnode * const nxt1 = nxt;
3899 if (!REGNODE_SIMPLE(OP(nxt))
3900 && !(PL_regkind[OP(nxt)] == EXACT
3901 && STR_LEN(nxt) == 1))
3907 if (OP(nxt) != CLOSE)
3909 if (RExC_open_parens) {
3910 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3911 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3913 /* Now we know that nxt2 is the only contents: */
3914 oscan->flags = (U8)ARG(nxt);
3916 OP(nxt1) = NOTHING; /* was OPEN. */
3919 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3920 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3921 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
3922 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3923 OP(nxt + 1) = OPTIMIZED; /* was count. */
3924 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
3929 /* Try optimization CURLYX => CURLYM. */
3930 if ( OP(oscan) == CURLYX && data
3931 && !(data->flags & SF_HAS_PAR)
3932 && !(data->flags & SF_HAS_EVAL)
3933 && !deltanext /* atom is fixed width */
3934 && minnext != 0 /* CURLYM can't handle zero width */
3935 && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
3937 /* XXXX How to optimize if data == 0? */
3938 /* Optimize to a simpler form. */
3939 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3943 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3944 && (OP(nxt2) != WHILEM))
3946 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3947 /* Need to optimize away parenths. */
3948 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
3949 /* Set the parenth number. */
3950 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3952 oscan->flags = (U8)ARG(nxt);
3953 if (RExC_open_parens) {
3954 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3955 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
3957 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3958 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3961 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3962 OP(nxt + 1) = OPTIMIZED; /* was count. */
3963 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3964 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
3967 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3968 regnode *nnxt = regnext(nxt1);
3970 if (reg_off_by_arg[OP(nxt1)])
3971 ARG_SET(nxt1, nxt2 - nxt1);
3972 else if (nxt2 - nxt1 < U16_MAX)
3973 NEXT_OFF(nxt1) = nxt2 - nxt1;
3975 OP(nxt) = NOTHING; /* Cannot beautify */
3980 /* Optimize again: */
3981 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3982 NULL, stopparen, recursed, NULL, 0,depth+1);
3987 else if ((OP(oscan) == CURLYX)
3988 && (flags & SCF_WHILEM_VISITED_POS)
3989 /* See the comment on a similar expression above.
3990 However, this time it's not a subexpression
3991 we care about, but the expression itself. */
3992 && (maxcount == REG_INFTY)
3993 && data && ++data->whilem_c < 16) {
3994 /* This stays as CURLYX, we can put the count/of pair. */
3995 /* Find WHILEM (as in regexec.c) */
3996 regnode *nxt = oscan + NEXT_OFF(oscan);
3998 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4000 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4001 | (RExC_whilem_seen << 4)); /* On WHILEM */
4003 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4005 if (flags & SCF_DO_SUBSTR) {
4006 SV *last_str = NULL;
4007 int counted = mincount != 0;
4009 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
4010 #if defined(SPARC64_GCC_WORKAROUND)
4013 const char *s = NULL;
4016 if (pos_before >= data->last_start_min)
4019 b = data->last_start_min;
4022 s = SvPV_const(data->last_found, l);
4023 old = b - data->last_start_min;
4026 I32 b = pos_before >= data->last_start_min
4027 ? pos_before : data->last_start_min;
4029 const char * const s = SvPV_const(data->last_found, l);
4030 I32 old = b - data->last_start_min;
4034 old = utf8_hop((U8*)s, old) - (U8*)s;
4036 /* Get the added string: */
4037 last_str = newSVpvn_utf8(s + old, l, UTF);
4038 if (deltanext == 0 && pos_before == b) {
4039 /* What was added is a constant string */
4041 SvGROW(last_str, (mincount * l) + 1);
4042 repeatcpy(SvPVX(last_str) + l,
4043 SvPVX_const(last_str), l, mincount - 1);
4044 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4045 /* Add additional parts. */
4046 SvCUR_set(data->last_found,
4047 SvCUR(data->last_found) - l);
4048 sv_catsv(data->last_found, last_str);
4050 SV * sv = data->last_found;
4052 SvUTF8(sv) && SvMAGICAL(sv) ?
4053 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4054 if (mg && mg->mg_len >= 0)
4055 mg->mg_len += CHR_SVLEN(last_str) - l;
4057 data->last_end += l * (mincount - 1);
4060 /* start offset must point into the last copy */
4061 data->last_start_min += minnext * (mincount - 1);
4062 data->last_start_max += is_inf ? I32_MAX
4063 : (maxcount - 1) * (minnext + data->pos_delta);
4066 /* It is counted once already... */
4067 data->pos_min += minnext * (mincount - counted);
4068 data->pos_delta += - counted * deltanext +
4069 (minnext + deltanext) * maxcount - minnext * mincount;
4070 if (mincount != maxcount) {
4071 /* Cannot extend fixed substrings found inside
4073 SCAN_COMMIT(pRExC_state,data,minlenp);
4074 if (mincount && last_str) {
4075 SV * const sv = data->last_found;
4076 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4077 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4081 sv_setsv(sv, last_str);
4082 data->last_end = data->pos_min;
4083 data->last_start_min =
4084 data->pos_min - CHR_SVLEN(last_str);
4085 data->last_start_max = is_inf
4087 : data->pos_min + data->pos_delta
4088 - CHR_SVLEN(last_str);
4090 data->longest = &(data->longest_float);
4092 SvREFCNT_dec(last_str);
4094 if (data && (fl & SF_HAS_EVAL))
4095 data->flags |= SF_HAS_EVAL;
4096 optimize_curly_tail:
4097 if (OP(oscan) != CURLYX) {
4098 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4100 NEXT_OFF(oscan) += NEXT_OFF(next);
4103 default: /* REF, ANYOFV, and CLUMP only? */
4104 if (flags & SCF_DO_SUBSTR) {
4105 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4106 data->longest = &(data->longest_float);
4108 is_inf = is_inf_internal = 1;
4109 if (flags & SCF_DO_STCLASS_OR)
4110 cl_anything(pRExC_state, data->start_class);
4111 flags &= ~SCF_DO_STCLASS;
4115 else if (OP(scan) == LNBREAK) {
4116 if (flags & SCF_DO_STCLASS) {
4118 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4119 if (flags & SCF_DO_STCLASS_AND) {
4120 for (value = 0; value < 256; value++)
4121 if (!is_VERTWS_cp(value))
4122 ANYOF_BITMAP_CLEAR(data->start_class, value);
4125 for (value = 0; value < 256; value++)
4126 if (is_VERTWS_cp(value))
4127 ANYOF_BITMAP_SET(data->start_class, value);
4129 if (flags & SCF_DO_STCLASS_OR)
4130 cl_and(data->start_class, and_withp);
4131 flags &= ~SCF_DO_STCLASS;
4134 delta++; /* Because of the 2 char string cr-lf */
4135 if (flags & SCF_DO_SUBSTR) {
4136 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4138 data->pos_delta += 1;
4139 data->longest = &(data->longest_float);
4142 else if (REGNODE_SIMPLE(OP(scan))) {
4145 if (flags & SCF_DO_SUBSTR) {
4146 SCAN_COMMIT(pRExC_state,data,minlenp);
4150 if (flags & SCF_DO_STCLASS) {
4151 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
4153 /* Some of the logic below assumes that switching
4154 locale on will only add false positives. */
4155 switch (PL_regkind[OP(scan)]) {
4159 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4160 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4161 cl_anything(pRExC_state, data->start_class);
4164 if (OP(scan) == SANY)
4166 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4167 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
4168 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
4169 cl_anything(pRExC_state, data->start_class);
4171 if (flags & SCF_DO_STCLASS_AND || !value)
4172 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4175 if (flags & SCF_DO_STCLASS_AND)
4176 cl_and(data->start_class,
4177 (struct regnode_charclass_class*)scan);
4179 cl_or(pRExC_state, data->start_class,
4180 (struct regnode_charclass_class*)scan);
4183 if (flags & SCF_DO_STCLASS_AND) {
4184 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4185 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
4186 if (OP(scan) == ALNUMU) {
4187 for (value = 0; value < 256; value++) {
4188 if (!isWORDCHAR_L1(value)) {
4189 ANYOF_BITMAP_CLEAR(data->start_class, value);
4193 for (value = 0; value < 256; value++) {
4194 if (!isALNUM(value)) {
4195 ANYOF_BITMAP_CLEAR(data->start_class, value);
4202 if (data->start_class->flags & ANYOF_LOCALE)
4203 ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
4205 /* Even if under locale, set the bits for non-locale
4206 * in case it isn't a true locale-node. This will
4207 * create false positives if it truly is locale */
4208 if (OP(scan) == ALNUMU) {
4209 for (value = 0; value < 256; value++) {
4210 if (isWORDCHAR_L1(value)) {
4211 ANYOF_BITMAP_SET(data->start_class, value);
4215 for (value = 0; value < 256; value++) {
4216 if (isALNUM(value)) {
4217 ANYOF_BITMAP_SET(data->start_class, value);
4224 if (flags & SCF_DO_STCLASS_AND) {
4225 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4226 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
4227 if (OP(scan) == NALNUMU) {
4228 for (value = 0; value < 256; value++) {
4229 if (isWORDCHAR_L1(value)) {
4230 ANYOF_BITMAP_CLEAR(data->start_class, value);
4234 for (value = 0; value < 256; value++) {
4235 if (isALNUM(value)) {
4236 ANYOF_BITMAP_CLEAR(data->start_class, value);
4243 if (data->start_class->flags & ANYOF_LOCALE)
4244 ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
4246 /* Even if under locale, set the bits for non-locale in
4247 * case it isn't a true locale-node. This will create
4248 * false positives if it truly is locale */
4249 if (OP(scan) == NALNUMU) {
4250 for (value = 0; value < 256; value++) {
4251 if (! isWORDCHAR_L1(value)) {
4252 ANYOF_BITMAP_SET(data->start_class, value);
4256 for (value = 0; value < 256; value++) {
4257 if (! isALNUM(value)) {
4258 ANYOF_BITMAP_SET(data->start_class, value);
4265 if (flags & SCF_DO_STCLASS_AND) {
4266 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4267 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
4268 if (OP(scan) == SPACEU) {
4269 for (value = 0; value < 256; value++) {
4270 if (!isSPACE_L1(value)) {
4271 ANYOF_BITMAP_CLEAR(data->start_class, value);
4275 for (value = 0; value < 256; value++) {
4276 if (!isSPACE(value)) {
4277 ANYOF_BITMAP_CLEAR(data->start_class, value);
4284 if (data->start_class->flags & ANYOF_LOCALE) {
4285 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
4287 if (OP(scan) == SPACEU) {
4288 for (value = 0; value < 256; value++) {
4289 if (isSPACE_L1(value)) {
4290 ANYOF_BITMAP_SET(data->start_class, value);
4294 for (value = 0; value < 256; value++) {
4295 if (isSPACE(value)) {
4296 ANYOF_BITMAP_SET(data->start_class, value);
4303 if (flags & SCF_DO_STCLASS_AND) {
4304 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4305 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
4306 if (OP(scan) == NSPACEU) {
4307 for (value = 0; value < 256; value++) {
4308 if (isSPACE_L1(value)) {
4309 ANYOF_BITMAP_CLEAR(data->start_class, value);
4313 for (value = 0; value < 256; value++) {
4314 if (isSPACE(value)) {
4315 ANYOF_BITMAP_CLEAR(data->start_class, value);
4322 if (data->start_class->flags & ANYOF_LOCALE)
4323 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
4324 if (OP(scan) == NSPACEU) {
4325 for (value = 0; value < 256; value++) {
4326 if (!isSPACE_L1(value)) {
4327 ANYOF_BITMAP_SET(data->start_class, value);
4332 for (value = 0; value < 256; value++) {
4333 if (!isSPACE(value)) {
4334 ANYOF_BITMAP_SET(data->start_class, value);
4341 if (flags & SCF_DO_STCLASS_AND) {
4342 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4343 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4344 for (value = 0; value < 256; value++)
4345 if (!isDIGIT(value))
4346 ANYOF_BITMAP_CLEAR(data->start_class, value);
4350 if (data->start_class->flags & ANYOF_LOCALE)
4351 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
4352 for (value = 0; value < 256; value++)
4354 ANYOF_BITMAP_SET(data->start_class, value);
4358 if (flags & SCF_DO_STCLASS_AND) {
4359 if (!(data->start_class->flags & ANYOF_LOCALE))
4360 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
4361 for (value = 0; value < 256; value++)
4363 ANYOF_BITMAP_CLEAR(data->start_class, value);
4366 if (data->start_class->flags & ANYOF_LOCALE)
4367 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
4368 for (value = 0; value < 256; value++)
4369 if (!isDIGIT(value))
4370 ANYOF_BITMAP_SET(data->start_class, value);
4373 CASE_SYNST_FNC(VERTWS);
4374 CASE_SYNST_FNC(HORIZWS);
4377 if (flags & SCF_DO_STCLASS_OR)
4378 cl_and(data->start_class, and_withp);
4379 flags &= ~SCF_DO_STCLASS;
4382 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4383 data->flags |= (OP(scan) == MEOL
4386 SCAN_COMMIT(pRExC_state, data, minlenp);
4389 else if ( PL_regkind[OP(scan)] == BRANCHJ
4390 /* Lookbehind, or need to calculate parens/evals/stclass: */
4391 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4392 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4393 if ( OP(scan) == UNLESSM &&
4395 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4396 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4399 regnode *upto= regnext(scan);
4401 SV * const mysv_val=sv_newmortal();
4402 DEBUG_STUDYDATA("OPFAIL",data,depth);
4404 /*DEBUG_PARSE_MSG("opfail");*/
4405 regprop(RExC_rx, mysv_val, upto);
4406 PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4407 SvPV_nolen_const(mysv_val),
4408 (IV)REG_NODE_NUM(upto),
4413 NEXT_OFF(scan) = upto - scan;
4414 for (opt= scan + 1; opt < upto ; opt++)
4415 OP(opt) = OPTIMIZED;
4419 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4420 || OP(scan) == UNLESSM )
4422 /* Negative Lookahead/lookbehind
4423 In this case we can't do fixed string optimisation.
4426 I32 deltanext, minnext, fake = 0;
4428 struct regnode_charclass_class intrnl;
4431 data_fake.flags = 0;
4433 data_fake.whilem_c = data->whilem_c;
4434 data_fake.last_closep = data->last_closep;
4437 data_fake.last_closep = &fake;
4438 data_fake.pos_delta = delta;
4439 if ( flags & SCF_DO_STCLASS && !scan->flags
4440 && OP(scan) == IFMATCH ) { /* Lookahead */
4441 cl_init(pRExC_state, &intrnl);
4442 data_fake.start_class = &intrnl;
4443 f |= SCF_DO_STCLASS_AND;
4445 if (flags & SCF_WHILEM_VISITED_POS)
4446 f |= SCF_WHILEM_VISITED_POS;
4447 next = regnext(scan);
4448 nscan = NEXTOPER(NEXTOPER(scan));
4449 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4450 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4453 FAIL("Variable length lookbehind not implemented");
4455 else if (minnext > (I32)U8_MAX) {
4456 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4458 scan->flags = (U8)minnext;
4461 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4463 if (data_fake.flags & SF_HAS_EVAL)
4464 data->flags |= SF_HAS_EVAL;
4465 data->whilem_c = data_fake.whilem_c;
4467 if (f & SCF_DO_STCLASS_AND) {
4468 if (flags & SCF_DO_STCLASS_OR) {
4469 /* OR before, AND after: ideally we would recurse with
4470 * data_fake to get the AND applied by study of the
4471 * remainder of the pattern, and then derecurse;
4472 * *** HACK *** for now just treat as "no information".
4473 * See [perl #56690].
4475 cl_init(pRExC_state, data->start_class);
4477 /* AND before and after: combine and continue */
4478 const int was = (data->start_class->flags & ANYOF_EOS);
4480 cl_and(data->start_class, &intrnl);
4482 data->start_class->flags |= ANYOF_EOS;
4486 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4488 /* Positive Lookahead/lookbehind
4489 In this case we can do fixed string optimisation,
4490 but we must be careful about it. Note in the case of
4491 lookbehind the positions will be offset by the minimum
4492 length of the pattern, something we won't know about
4493 until after the recurse.
4495 I32 deltanext, fake = 0;
4497 struct regnode_charclass_class intrnl;
4499 /* We use SAVEFREEPV so that when the full compile
4500 is finished perl will clean up the allocated
4501 minlens when it's all done. This way we don't
4502 have to worry about freeing them when we know
4503 they wont be used, which would be a pain.
4506 Newx( minnextp, 1, I32 );
4507 SAVEFREEPV(minnextp);
4510 StructCopy(data, &data_fake, scan_data_t);
4511 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4514 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
4515 data_fake.last_found=newSVsv(data->last_found);
4519 data_fake.last_closep = &fake;
4520 data_fake.flags = 0;
4521 data_fake.pos_delta = delta;
4523 data_fake.flags |= SF_IS_INF;
4524 if ( flags & SCF_DO_STCLASS && !scan->flags
4525 && OP(scan) == IFMATCH ) { /* Lookahead */
4526 cl_init(pRExC_state, &intrnl);
4527 data_fake.start_class = &intrnl;
4528 f |= SCF_DO_STCLASS_AND;
4530 if (flags & SCF_WHILEM_VISITED_POS)
4531 f |= SCF_WHILEM_VISITED_POS;
4532 next = regnext(scan);
4533 nscan = NEXTOPER(NEXTOPER(scan));
4535 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4536 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4539 FAIL("Variable length lookbehind not implemented");
4541 else if (*minnextp > (I32)U8_MAX) {
4542 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
4544 scan->flags = (U8)*minnextp;
4549 if (f & SCF_DO_STCLASS_AND) {
4550 const int was = (data->start_class->flags & ANYOF_EOS);
4552 cl_and(data->start_class, &intrnl);
4554 data->start_class->flags |= ANYOF_EOS;
4557 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4559 if (data_fake.flags & SF_HAS_EVAL)
4560 data->flags |= SF_HAS_EVAL;
4561 data->whilem_c = data_fake.whilem_c;
4562 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4563 if (RExC_rx->minlen<*minnextp)
4564 RExC_rx->minlen=*minnextp;
4565 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
4566 SvREFCNT_dec(data_fake.last_found);
4568 if ( data_fake.minlen_fixed != minlenp )
4570 data->offset_fixed= data_fake.offset_fixed;
4571 data->minlen_fixed= data_fake.minlen_fixed;
4572 data->lookbehind_fixed+= scan->flags;
4574 if ( data_fake.minlen_float != minlenp )
4576 data->minlen_float= data_fake.minlen_float;
4577 data->offset_float_min=data_fake.offset_float_min;
4578 data->offset_float_max=data_fake.offset_float_max;
4579 data->lookbehind_float+= scan->flags;
4586 else if (OP(scan) == OPEN) {
4587 if (stopparen != (I32)ARG(scan))
4590 else if (OP(scan) == CLOSE) {
4591 if (stopparen == (I32)ARG(scan)) {
4594 if ((I32)ARG(scan) == is_par) {
4595 next = regnext(scan);
4597 if ( next && (OP(next) != WHILEM) && next < last)
4598 is_par = 0; /* Disable optimization */
4601 *(data->last_closep) = ARG(scan);
4603 else if (OP(scan) == EVAL) {
4605 data->flags |= SF_HAS_EVAL;
4607 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4608 if (flags & SCF_DO_SUBSTR) {
4609 SCAN_COMMIT(pRExC_state,data,minlenp);
4610 flags &= ~SCF_DO_SUBSTR;
4612 if (data && OP(scan)==ACCEPT) {
4613 data->flags |= SCF_SEEN_ACCEPT;
4618 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4620 if (flags & SCF_DO_SUBSTR) {
4621 SCAN_COMMIT(pRExC_state,data,minlenp);
4622 data->longest = &(data->longest_float);
4624 is_inf = is_inf_internal = 1;
4625 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4626 cl_anything(pRExC_state, data->start_class);
4627 flags &= ~SCF_DO_STCLASS;
4629 else if (OP(scan) == GPOS) {
4630 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
4631 !(delta || is_inf || (data && data->pos_delta)))
4633 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4634 RExC_rx->extflags |= RXf_ANCH_GPOS;
4635 if (RExC_rx->gofs < (U32)min)
4636 RExC_rx->gofs = min;
4638 RExC_rx->extflags |= RXf_GPOS_FLOAT;
4642 #ifdef TRIE_STUDY_OPT
4643 #ifdef FULL_TRIE_STUDY
4644 else if (PL_regkind[OP(scan)] == TRIE) {
4645 /* NOTE - There is similar code to this block above for handling
4646 BRANCH nodes on the initial study. If you change stuff here
4648 regnode *trie_node= scan;
4649 regnode *tail= regnext(scan);
4650 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4651 I32 max1 = 0, min1 = I32_MAX;
4652 struct regnode_charclass_class accum;
4654 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
4655 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
4656 if (flags & SCF_DO_STCLASS)
4657 cl_init_zero(pRExC_state, &accum);
4663 const regnode *nextbranch= NULL;
4666 for ( word=1 ; word <= trie->wordcount ; word++)
4668 I32 deltanext=0, minnext=0, f = 0, fake;
4669 struct regnode_charclass_class this_class;
4671 data_fake.flags = 0;
4673 data_fake.whilem_c = data->whilem_c;
4674 data_fake.last_closep = data->last_closep;
4677 data_fake.last_closep = &fake;
4678 data_fake.pos_delta = delta;
4679 if (flags & SCF_DO_STCLASS) {
4680 cl_init(pRExC_state, &this_class);
4681 data_fake.start_class = &this_class;
4682 f = SCF_DO_STCLASS_AND;
4684 if (flags & SCF_WHILEM_VISITED_POS)
4685 f |= SCF_WHILEM_VISITED_POS;
4687 if (trie->jump[word]) {
4689 nextbranch = trie_node + trie->jump[0];
4690 scan= trie_node + trie->jump[word];
4691 /* We go from the jump point to the branch that follows
4692 it. Note this means we need the vestigal unused branches
4693 even though they arent otherwise used.
4695 minnext = study_chunk(pRExC_state, &scan, minlenp,
4696 &deltanext, (regnode *)nextbranch, &data_fake,
4697 stopparen, recursed, NULL, f,depth+1);
4699 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4700 nextbranch= regnext((regnode*)nextbranch);
4702 if (min1 > (I32)(minnext + trie->minlen))
4703 min1 = minnext + trie->minlen;
4704 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4705 max1 = minnext + deltanext + trie->maxlen;
4706 if (deltanext == I32_MAX)
4707 is_inf = is_inf_internal = 1;
4709 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4711 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4712 if ( stopmin > min + min1)
4713 stopmin = min + min1;
4714 flags &= ~SCF_DO_SUBSTR;
4716 data->flags |= SCF_SEEN_ACCEPT;
4719 if (data_fake.flags & SF_HAS_EVAL)
4720 data->flags |= SF_HAS_EVAL;
4721 data->whilem_c = data_fake.whilem_c;
4723 if (flags & SCF_DO_STCLASS)
4724 cl_or(pRExC_state, &accum, &this_class);
4727 if (flags & SCF_DO_SUBSTR) {
4728 data->pos_min += min1;
4729 data->pos_delta += max1 - min1;
4730 if (max1 != min1 || is_inf)
4731 data->longest = &(data->longest_float);
4734 delta += max1 - min1;
4735 if (flags & SCF_DO_STCLASS_OR) {
4736 cl_or(pRExC_state, data->start_class, &accum);
4738 cl_and(data->start_class, and_withp);
4739 flags &= ~SCF_DO_STCLASS;
4742 else if (flags & SCF_DO_STCLASS_AND) {
4744 cl_and(data->start_class, &accum);
4745 flags &= ~SCF_DO_STCLASS;
4748 /* Switch to OR mode: cache the old value of
4749 * data->start_class */
4751 StructCopy(data->start_class, and_withp,
4752 struct regnode_charclass_class);
4753 flags &= ~SCF_DO_STCLASS_AND;
4754 StructCopy(&accum, data->start_class,
4755 struct regnode_charclass_class);
4756 flags |= SCF_DO_STCLASS_OR;
4757 data->start_class->flags |= ANYOF_EOS;
4764 else if (PL_regkind[OP(scan)] == TRIE) {
4765 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
4768 min += trie->minlen;
4769 delta += (trie->maxlen - trie->minlen);
4770 flags &= ~SCF_DO_STCLASS; /* xxx */
4771 if (flags & SCF_DO_SUBSTR) {
4772 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4773 data->pos_min += trie->minlen;
4774 data->pos_delta += (trie->maxlen - trie->minlen);
4775 if (trie->maxlen != trie->minlen)
4776 data->longest = &(data->longest_float);
4778 if (trie->jump) /* no more substrings -- for now /grr*/
4779 flags &= ~SCF_DO_SUBSTR;
4781 #endif /* old or new */
4782 #endif /* TRIE_STUDY_OPT */
4784 /* Else: zero-length, ignore. */
4785 scan = regnext(scan);
4790 stopparen = frame->stop;
4791 frame = frame->prev;
4792 goto fake_study_recurse;
4797 DEBUG_STUDYDATA("pre-fin:",data,depth);
4800 *deltap = is_inf_internal ? I32_MAX : delta;
4801 if (flags & SCF_DO_SUBSTR && is_inf)
4802 data->pos_delta = I32_MAX - data->pos_min;
4803 if (is_par > (I32)U8_MAX)
4805 if (is_par && pars==1 && data) {
4806 data->flags |= SF_IN_PAR;
4807 data->flags &= ~SF_HAS_PAR;
4809 else if (pars && data) {
4810 data->flags |= SF_HAS_PAR;
4811 data->flags &= ~SF_IN_PAR;
4813 if (flags & SCF_DO_STCLASS_OR)
4814 cl_and(data->start_class, and_withp);
4815 if (flags & SCF_TRIE_RESTUDY)
4816 data->flags |= SCF_TRIE_RESTUDY;
4818 DEBUG_STUDYDATA("post-fin:",data,depth);
4820 return min < stopmin ? min : stopmin;
4824 S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
4826 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4828 PERL_ARGS_ASSERT_ADD_DATA;
4830 Renewc(RExC_rxi->data,
4831 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4832 char, struct reg_data);
4834 Renew(RExC_rxi->data->what, count + n, U8);
4836 Newx(RExC_rxi->data->what, n, U8);
4837 RExC_rxi->data->count = count + n;
4838 Copy(s, RExC_rxi->data->what + count, n, U8);
4842 /*XXX: todo make this not included in a non debugging perl */
4843 #ifndef PERL_IN_XSUB_RE
4845 Perl_reginitcolors(pTHX)
4848 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
4850 char *t = savepv(s);
4854 t = strchr(t, '\t');
4860 PL_colors[i] = t = (char *)"";
4865 PL_colors[i++] = (char *)"";
4872 #ifdef TRIE_STUDY_OPT
4873 #define CHECK_RESTUDY_GOTO \
4875 (data.flags & SCF_TRIE_RESTUDY) \
4879 #define CHECK_RESTUDY_GOTO
4883 * pregcomp - compile a regular expression into internal code
4885 * Decides which engine's compiler to call based on the hint currently in
4889 #ifndef PERL_IN_XSUB_RE
4891 /* return the currently in-scope regex engine (or the default if none) */
4893 regexp_engine const *
4894 Perl_current_re_engine(pTHX)
4898 if (IN_PERL_COMPILETIME) {
4899 HV * const table = GvHV(PL_hintgv);
4903 return &PL_core_reg_engine;
4904 ptr = hv_fetchs(table, "regcomp", FALSE);
4905 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
4906 return &PL_core_reg_engine;
4907 return INT2PTR(regexp_engine*,SvIV(*ptr));
4911 if (!PL_curcop->cop_hints_hash)
4912 return &PL_core_reg_engine;
4913 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
4914 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
4915 return &PL_core_reg_engine;
4916 return INT2PTR(regexp_engine*,SvIV(ptr));
4922 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
4925 regexp_engine const *eng = current_re_engine();
4926 GET_RE_DEBUG_FLAGS_DECL;
4928 PERL_ARGS_ASSERT_PREGCOMP;
4930 /* Dispatch a request to compile a regexp to correct regexp engine. */
4932 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
4935 return CALLREGCOMP_ENG(eng, pattern, flags);
4939 /* public(ish) entry point for the perl core's own regex compiling code.
4940 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
4941 * pattern rather than a list of OPs, and uses the internal engine rather
4942 * than the current one */
4945 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
4947 SV *pat = pattern; /* defeat constness! */
4948 PERL_ARGS_ASSERT_RE_COMPILE;
4949 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
4950 #ifdef PERL_IN_XSUB_RE
4953 &PL_core_reg_engine,
4955 NULL, NULL, rx_flags, 0);
4958 /* see if there are any run-time code blocks in the pattern.
4959 * False positives are allowed */
4962 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
4963 U32 pm_flags, char *pat, STRLEN plen)
4968 /* avoid infinitely recursing when we recompile the pattern parcelled up
4969 * as qr'...'. A single constant qr// string can't have have any
4970 * run-time component in it, and thus, no runtime code. (A non-qr
4971 * string, however, can, e.g. $x =~ '(?{})') */
4972 if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
4975 for (s = 0; s < plen; s++) {
4976 if (n < pRExC_state->num_code_blocks
4977 && s == pRExC_state->code_blocks[n].start)
4979 s = pRExC_state->code_blocks[n].end;
4983 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
4985 if (pat[s] == '(' && pat[s+1] == '?' &&
4986 (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
4993 /* Handle run-time code blocks. We will already have compiled any direct
4994 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
4995 * copy of it, but with any literal code blocks blanked out and
4996 * appropriate chars escaped; then feed it into
4998 * eval "qr'modified_pattern'"
5002 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5006 * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno'
5008 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5009 * and merge them with any code blocks of the original regexp.
5011 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5012 * instead, just save the qr and return FALSE; this tells our caller that
5013 * the original pattern needs upgrading to utf8.
5017 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5018 char *pat, STRLEN plen)
5022 GET_RE_DEBUG_FLAGS_DECL;
5024 if (pRExC_state->runtime_code_qr) {
5025 /* this is the second time we've been called; this should
5026 * only happen if the main pattern got upgraded to utf8
5027 * during compilation; re-use the qr we compiled first time
5028 * round (which should be utf8 too)
5030 qr = pRExC_state->runtime_code_qr;
5031 pRExC_state->runtime_code_qr = NULL;
5032 assert(RExC_utf8 && SvUTF8(qr));
5038 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5042 /* determine how many extra chars we need for ' and \ escaping */
5043 for (s = 0; s < plen; s++) {
5044 if (pat[s] == '\'' || pat[s] == '\\')
5048 Newx(newpat, newlen, char);
5050 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5052 for (s = 0; s < plen; s++) {
5053 if (n < pRExC_state->num_code_blocks
5054 && s == pRExC_state->code_blocks[n].start)
5056 /* blank out literal code block */
5057 assert(pat[s] == '(');
5058 while (s <= pRExC_state->code_blocks[n].end) {
5066 if (pat[s] == '\'' || pat[s] == '\\')
5071 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5075 PerlIO_printf(Perl_debug_log,
5076 "%sre-parsing pattern for runtime code:%s %s\n",
5077 PL_colors[4],PL_colors[5],newpat);
5080 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5086 PUSHSTACKi(PERLSI_REQUIRE);
5087 /* this causes the toker to collapse \\ into \ when parsing
5088 * qr''; normally only q'' does this. It also alters hints
5090 PL_reg_state.re_reparsing = TRUE;
5091 eval_sv(sv, G_SCALAR);
5097 Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
5098 assert(SvROK(qr_ref));
5100 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5101 /* the leaving below frees the tmp qr_ref.
5102 * Give qr a life of its own */
5110 if (!RExC_utf8 && SvUTF8(qr)) {
5111 /* first time through; the pattern got upgraded; save the
5112 * qr for the next time through */
5113 assert(!pRExC_state->runtime_code_qr);
5114 pRExC_state->runtime_code_qr = qr;
5119 /* extract any code blocks within the returned qr// */
5122 /* merge the main (r1) and run-time (r2) code blocks into one */
5124 RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
5125 struct reg_code_block *new_block, *dst;
5126 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
5129 if (!r2->num_code_blocks) /* we guessed wrong */
5133 r1->num_code_blocks + r2->num_code_blocks,
5134 struct reg_code_block);
5137 while ( i1 < r1->num_code_blocks
5138 || i2 < r2->num_code_blocks)
5140 struct reg_code_block *src;
5143 if (i1 == r1->num_code_blocks) {
5144 src = &r2->code_blocks[i2++];
5147 else if (i2 == r2->num_code_blocks)
5148 src = &r1->code_blocks[i1++];
5149 else if ( r1->code_blocks[i1].start
5150 < r2->code_blocks[i2].start)
5152 src = &r1->code_blocks[i1++];
5153 assert(src->end < r2->code_blocks[i2].start);
5156 assert( r1->code_blocks[i1].start
5157 > r2->code_blocks[i2].start);
5158 src = &r2->code_blocks[i2++];
5160 assert(src->end < r1->code_blocks[i1].start);
5163 assert(pat[src->start] == '(');
5164 assert(pat[src->end] == ')');
5165 dst->start = src->start;
5166 dst->end = src->end;
5167 dst->block = src->block;
5168 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
5172 r1->num_code_blocks += r2->num_code_blocks;
5173 Safefree(r1->code_blocks);
5174 r1->code_blocks = new_block;
5183 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
5185 /* This is the common code for setting up the floating and fixed length
5186 * string data extracted from Perlre_op_compile() below. Returns a boolean
5187 * as to whether succeeded or not */
5191 if (! (longest_length
5192 || (eol /* Can't have SEOL and MULTI */
5193 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
5195 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
5196 || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
5201 /* copy the information about the longest from the reg_scan_data
5202 over to the program. */
5203 if (SvUTF8(sv_longest)) {
5204 *rx_utf8 = sv_longest;
5207 *rx_substr = sv_longest;
5210 /* end_shift is how many chars that must be matched that
5211 follow this item. We calculate it ahead of time as once the
5212 lookbehind offset is added in we lose the ability to correctly
5214 ml = minlen ? *(minlen) : (I32)longest_length;
5215 *rx_end_shift = ml - offset
5216 - longest_length + (SvTAIL(sv_longest) != 0)
5219 t = (eol/* Can't have SEOL and MULTI */
5220 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
5221 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
5227 * Perl_re_op_compile - the perl internal RE engine's function to compile a
5228 * regular expression into internal code.
5229 * The pattern may be passed either as:
5230 * a list of SVs (patternp plus pat_count)
5231 * a list of OPs (expr)
5232 * If both are passed, the SV list is used, but the OP list indicates
5233 * which SVs are actually pre-compiled code blocks
5235 * The SVs in the list have magic and qr overloading applied to them (and
5236 * the list may be modified in-place with replacement SVs in the latter
5239 * If the pattern hasn't changed from old_re, then old_re will be
5242 * eng is the current engine. If that engine has an op_comp method, then
5243 * handle directly (i.e. we assume that op_comp was us); otherwise, just
5244 * do the initial concatenation of arguments and pass on to the external
5247 * If is_bare_re is not null, set it to a boolean indicating whether the
5248 * arg list reduced (after overloading) to a single bare regex which has
5249 * been returned (i.e. /$qr/).
5251 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
5253 * pm_flags contains the PMf_* flags, typically based on those from the
5254 * pm_flags field of the related PMOP. Currently we're only interested in
5255 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
5257 * We can't allocate space until we know how big the compiled form will be,
5258 * but we can't compile it (and thus know how big it is) until we've got a
5259 * place to put the code. So we cheat: we compile it twice, once with code
5260 * generation turned off and size counting turned on, and once "for real".
5261 * This also means that we don't allocate space until we are sure that the
5262 * thing really will compile successfully, and we never have to move the
5263 * code and thus invalidate pointers into it. (Note that it has to be in
5264 * one piece because free() must be able to free it all.) [NB: not true in perl]
5266 * Beware that the optimization-preparation code in here knows about some
5267 * of the structure of the compiled regexp. [I'll say.]
5271 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
5272 OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
5273 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
5278 regexp_internal *ri;
5288 /* these are all flags - maybe they should be turned
5289 * into a single int with different bit masks */
5290 I32 sawlookahead = 0;
5293 bool used_setjump = FALSE;
5294 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
5295 bool code_is_utf8 = 0;
5296 bool VOL recompile = 0;
5297 bool runtime_code = 0;
5301 RExC_state_t RExC_state;
5302 RExC_state_t * const pRExC_state = &RExC_state;
5303 #ifdef TRIE_STUDY_OPT
5305 RExC_state_t copyRExC_state;
5307 GET_RE_DEBUG_FLAGS_DECL;
5309 PERL_ARGS_ASSERT_RE_OP_COMPILE;
5311 DEBUG_r(if (!PL_colorset) reginitcolors());
5313 #ifndef PERL_IN_XSUB_RE
5314 /* Initialize these here instead of as-needed, as is quick and avoids
5315 * having to test them each time otherwise */
5316 if (! PL_AboveLatin1) {
5317 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
5318 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
5319 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
5321 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
5322 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
5324 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
5325 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
5327 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
5328 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
5330 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
5332 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
5333 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
5335 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
5337 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
5338 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
5340 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
5341 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
5343 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
5344 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
5346 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
5347 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
5349 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
5350 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
5352 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
5353 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
5355 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
5356 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
5358 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
5360 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
5361 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
5363 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
5364 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
5366 PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
5370 pRExC_state->code_blocks = NULL;
5371 pRExC_state->num_code_blocks = 0;
5374 *is_bare_re = FALSE;
5376 if (expr && (expr->op_type == OP_LIST ||
5377 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
5379 /* is the source UTF8, and how many code blocks are there? */
5383 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5384 if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
5386 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
5387 /* count of DO blocks */
5391 pRExC_state->num_code_blocks = ncode;
5392 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
5397 /* handle a list of SVs */
5401 /* apply magic and RE overloading to each arg */
5402 for (svp = patternp; svp < patternp + pat_count; svp++) {
5405 if (SvROK(rx) && SvAMAGIC(rx)) {
5406 SV *sv = AMG_CALLunary(rx, regexp_amg);
5410 if (SvTYPE(sv) != SVt_REGEXP)
5411 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5417 if (pat_count > 1) {
5418 /* concat multiple args and find any code block indexes */
5423 STRLEN orig_patlen = 0;
5425 if (pRExC_state->num_code_blocks) {
5426 o = cLISTOPx(expr)->op_first;
5427 assert(o->op_type == OP_PUSHMARK);
5431 pat = newSVpvn("", 0);
5434 /* determine if the pattern is going to be utf8 (needed
5435 * in advance to align code block indices correctly).
5436 * XXX This could fail to be detected for an arg with
5437 * overloading but not concat overloading; but the main effect
5438 * in this obscure case is to need a 'use re eval' for a
5439 * literal code block */
5440 for (svp = patternp; svp < patternp + pat_count; svp++) {
5447 for (svp = patternp; svp < patternp + pat_count; svp++) {
5448 SV *sv, *msv = *svp;
5452 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5453 assert(n < pRExC_state->num_code_blocks);
5454 pRExC_state->code_blocks[n].start = SvCUR(pat);
5455 pRExC_state->code_blocks[n].block = o;
5456 pRExC_state->code_blocks[n].src_regex = NULL;
5459 o = o->op_sibling; /* skip CONST */
5465 if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5466 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5469 /* overloading involved: all bets are off over literal
5470 * code. Pretend we haven't seen it */
5471 pRExC_state->num_code_blocks -= n;
5477 while (SvAMAGIC(msv)
5478 && (sv = AMG_CALLunary(msv, string_amg))
5482 && SvRV(msv) == SvRV(sv))
5487 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5489 orig_patlen = SvCUR(pat);
5490 sv_catsv_nomg(pat, msv);
5493 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5496 /* extract any code blocks within any embedded qr//'s */
5497 if (rx && SvTYPE(rx) == SVt_REGEXP
5498 && RX_ENGINE((REGEXP*)rx)->op_comp)
5501 RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
5502 if (ri->num_code_blocks) {
5504 /* the presence of an embedded qr// with code means
5505 * we should always recompile: the text of the
5506 * qr// may not have changed, but it may be a
5507 * different closure than last time */
5509 Renew(pRExC_state->code_blocks,
5510 pRExC_state->num_code_blocks + ri->num_code_blocks,
5511 struct reg_code_block);
5512 pRExC_state->num_code_blocks += ri->num_code_blocks;
5513 for (i=0; i < ri->num_code_blocks; i++) {
5514 struct reg_code_block *src, *dst;
5515 STRLEN offset = orig_patlen
5516 + ((struct regexp *)SvANY(rx))->pre_prefix;
5517 assert(n < pRExC_state->num_code_blocks);
5518 src = &ri->code_blocks[i];
5519 dst = &pRExC_state->code_blocks[n];
5520 dst->start = src->start + offset;
5521 dst->end = src->end + offset;
5522 dst->block = src->block;
5523 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5537 while (SvAMAGIC(pat)
5538 && (sv = AMG_CALLunary(pat, string_amg))
5546 /* handle bare regex: foo =~ $re */
5551 if (SvTYPE(re) == SVt_REGEXP) {
5555 Safefree(pRExC_state->code_blocks);
5561 /* not a list of SVs, so must be a list of OPs */
5563 if (expr->op_type == OP_LIST) {
5568 pat = newSVpvn("", 0);
5573 /* given a list of CONSTs and DO blocks in expr, append all
5574 * the CONSTs to pat, and record the start and end of each
5575 * code block in code_blocks[] (each DO{} op is followed by an
5576 * OP_CONST containing the corresponding literal '(?{...})
5579 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
5580 if (o->op_type == OP_CONST) {
5581 sv_catsv(pat, cSVOPo_sv);
5583 pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
5587 else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
5588 assert(i+1 < pRExC_state->num_code_blocks);
5589 pRExC_state->code_blocks[++i].start = SvCUR(pat);
5590 pRExC_state->code_blocks[i].block = o;
5591 pRExC_state->code_blocks[i].src_regex = NULL;
5597 assert(expr->op_type == OP_CONST);
5598 pat = cSVOPx_sv(expr);
5602 exp = SvPV_nomg(pat, plen);
5604 if (!eng->op_comp) {
5605 if ((SvUTF8(pat) && IN_BYTES)
5606 || SvGMAGICAL(pat) || SvAMAGIC(pat))
5608 /* make a temporary copy; either to convert to bytes,
5609 * or to avoid repeating get-magic / overloaded stringify */
5610 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
5611 (IN_BYTES ? 0 : SvUTF8(pat)));
5613 Safefree(pRExC_state->code_blocks);
5614 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
5617 /* ignore the utf8ness if the pattern is 0 length */
5618 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
5619 RExC_uni_semantics = 0;
5620 RExC_contains_locale = 0;
5621 pRExC_state->runtime_code_qr = NULL;
5623 /****************** LONG JUMP TARGET HERE***********************/
5624 /* Longjmp back to here if have to switch in midstream to utf8 */
5625 if (! RExC_orig_utf8) {
5626 JMPENV_PUSH(jump_ret);
5627 used_setjump = TRUE;
5630 if (jump_ret == 0) { /* First time through */
5634 SV *dsv= sv_newmortal();
5635 RE_PV_QUOTED_DECL(s, RExC_utf8,
5636 dsv, exp, plen, 60);
5637 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
5638 PL_colors[4],PL_colors[5],s);
5641 else { /* longjumped back */
5644 STRLEN s = 0, d = 0;
5647 /* If the cause for the longjmp was other than changing to utf8, pop
5648 * our own setjmp, and longjmp to the correct handler */
5649 if (jump_ret != UTF8_LONGJMP) {
5651 JMPENV_JUMP(jump_ret);
5656 /* It's possible to write a regexp in ascii that represents Unicode
5657 codepoints outside of the byte range, such as via \x{100}. If we
5658 detect such a sequence we have to convert the entire pattern to utf8
5659 and then recompile, as our sizing calculation will have been based
5660 on 1 byte == 1 character, but we will need to use utf8 to encode
5661 at least some part of the pattern, and therefore must convert the whole
5664 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5665 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5667 /* upgrade pattern to UTF8, and if there are code blocks,
5668 * recalculate the indices.
5669 * This is essentially an unrolled Perl_bytes_to_utf8() */
5671 src = (U8*)SvPV_nomg(pat, plen);
5672 Newx(dst, plen * 2 + 1, U8);
5675 const UV uv = NATIVE_TO_ASCII(src[s]);
5676 if (UNI_IS_INVARIANT(uv))
5677 dst[d] = (U8)UTF_TO_NATIVE(uv);
5679 dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
5680 dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
5682 if (n < pRExC_state->num_code_blocks) {
5683 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5684 pRExC_state->code_blocks[n].start = d;
5685 assert(dst[d] == '(');
5688 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5689 pRExC_state->code_blocks[n].end = d;
5690 assert(dst[d] == ')');
5703 RExC_orig_utf8 = RExC_utf8 = 1;
5706 /* return old regex if pattern hasn't changed */
5710 && !!RX_UTF8(old_re) == !!RExC_utf8
5711 && RX_PRECOMP(old_re)
5712 && RX_PRELEN(old_re) == plen
5713 && memEQ(RX_PRECOMP(old_re), exp, plen))
5715 /* with runtime code, always recompile */
5716 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5718 if (!runtime_code) {
5722 Safefree(pRExC_state->code_blocks);
5726 else if ((pm_flags & PMf_USE_RE_EVAL)
5727 /* this second condition covers the non-regex literal case,
5728 * i.e. $foo =~ '(?{})'. */
5729 || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
5730 && (PL_hints & HINT_RE_EVAL))
5732 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
5735 #ifdef TRIE_STUDY_OPT
5739 rx_flags = orig_rx_flags;
5741 if (initial_charset == REGEX_LOCALE_CHARSET) {
5742 RExC_contains_locale = 1;
5744 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
5746 /* Set to use unicode semantics if the pattern is in utf8 and has the
5747 * 'depends' charset specified, as it means unicode when utf8 */
5748 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5752 RExC_flags = rx_flags;
5753 RExC_pm_flags = pm_flags;
5756 if (PL_tainting && PL_tainted)
5757 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
5759 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
5760 /* whoops, we have a non-utf8 pattern, whilst run-time code
5761 * got compiled as utf8. Try again with a utf8 pattern */
5762 JMPENV_JUMP(UTF8_LONGJMP);
5765 assert(!pRExC_state->runtime_code_qr);
5770 RExC_in_lookbehind = 0;
5771 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
5773 RExC_override_recoding = 0;
5774 RExC_in_multi_char_class = 0;
5776 /* First pass: determine size, legality. */
5784 RExC_emit = &PL_regdummy;
5785 RExC_whilem_seen = 0;
5786 RExC_open_parens = NULL;
5787 RExC_close_parens = NULL;
5789 RExC_paren_names = NULL;
5791 RExC_paren_name_list = NULL;
5793 RExC_recurse = NULL;
5794 RExC_recurse_count = 0;
5795 pRExC_state->code_index = 0;
5797 #if 0 /* REGC() is (currently) a NOP at the first pass.
5798 * Clever compilers notice this and complain. --jhi */
5799 REGC((U8)REG_MAGIC, (char*)RExC_emit);
5802 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
5804 RExC_lastparse=NULL;
5806 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5807 RExC_precomp = NULL;
5808 Safefree(pRExC_state->code_blocks);
5812 /* Here, finished first pass. Get rid of any added setjmp */
5818 PerlIO_printf(Perl_debug_log,
5819 "Required size %"IVdf" nodes\n"
5820 "Starting second pass (creation)\n",
5823 RExC_lastparse=NULL;
5826 /* The first pass could have found things that force Unicode semantics */
5827 if ((RExC_utf8 || RExC_uni_semantics)
5828 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
5830 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
5833 /* Small enough for pointer-storage convention?
5834 If extralen==0, this means that we will not need long jumps. */
5835 if (RExC_size >= 0x10000L && RExC_extralen)
5836 RExC_size += RExC_extralen;
5839 if (RExC_whilem_seen > 15)
5840 RExC_whilem_seen = 15;
5842 /* Allocate space and zero-initialize. Note, the two step process
5843 of zeroing when in debug mode, thus anything assigned has to
5844 happen after that */
5845 rx = (REGEXP*) newSV_type(SVt_REGEXP);
5846 r = (struct regexp*)SvANY(rx);
5847 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5848 char, regexp_internal);
5849 if ( r == NULL || ri == NULL )
5850 FAIL("Regexp out of space");
5852 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
5853 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
5855 /* bulk initialize base fields with 0. */
5856 Zero(ri, sizeof(regexp_internal), char);
5859 /* non-zero initialization begins here */
5862 r->extflags = rx_flags;
5863 if (pm_flags & PMf_IS_QR) {
5864 ri->code_blocks = pRExC_state->code_blocks;
5865 ri->num_code_blocks = pRExC_state->num_code_blocks;
5868 SAVEFREEPV(pRExC_state->code_blocks);
5871 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
5872 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
5874 /* The caret is output if there are any defaults: if not all the STD
5875 * flags are set, or if no character set specifier is needed */
5877 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5879 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
5880 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5881 >> RXf_PMf_STD_PMMOD_SHIFT);
5882 const char *fptr = STD_PAT_MODS; /*"msix"*/
5884 /* Allocate for the worst case, which is all the std flags are turned
5885 * on. If more precision is desired, we could do a population count of
5886 * the flags set. This could be done with a small lookup table, or by
5887 * shifting, masking and adding, or even, when available, assembly
5888 * language for a machine-language population count.
5889 * We never output a minus, as all those are defaults, so are
5890 * covered by the caret */
5891 const STRLEN wraplen = plen + has_p + has_runon
5892 + has_default /* If needs a caret */
5894 /* If needs a character set specifier */
5895 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
5896 + (sizeof(STD_PAT_MODS) - 1)
5897 + (sizeof("(?:)") - 1);
5899 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
5902 SvFLAGS(rx) |= SVf_UTF8;
5905 /* If a default, cover it using the caret */
5907 *p++= DEFAULT_PAT_MOD;
5911 const char* const name = get_regex_charset_name(r->extflags, &len);
5912 Copy(name, p, len, char);
5916 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
5919 while((ch = *fptr++)) {
5927 Copy(RExC_precomp, p, plen, char);
5928 assert ((RX_WRAPPED(rx) - p) < 16);
5929 r->pre_prefix = p - RX_WRAPPED(rx);
5935 SvCUR_set(rx, p - SvPVX_const(rx));
5939 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
5941 if (RExC_seen & REG_SEEN_RECURSE) {
5942 Newxz(RExC_open_parens, RExC_npar,regnode *);
5943 SAVEFREEPV(RExC_open_parens);
5944 Newxz(RExC_close_parens,RExC_npar,regnode *);
5945 SAVEFREEPV(RExC_close_parens);
5948 /* Useful during FAIL. */
5949 #ifdef RE_TRACK_PATTERN_OFFSETS
5950 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
5951 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
5952 "%s %"UVuf" bytes for offset annotations.\n",
5953 ri->u.offsets ? "Got" : "Couldn't get",
5954 (UV)((2*RExC_size+1) * sizeof(U32))));
5956 SetProgLen(ri,RExC_size);
5961 /* Second pass: emit code. */
5962 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
5963 RExC_pm_flags = pm_flags;
5968 RExC_emit_start = ri->program;
5969 RExC_emit = ri->program;
5970 RExC_emit_bound = ri->program + RExC_size + 1;
5971 pRExC_state->code_index = 0;
5973 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
5974 if (reg(pRExC_state, 0, &flags,1) == NULL) {
5978 /* XXXX To minimize changes to RE engine we always allocate
5979 3-units-long substrs field. */
5980 Newx(r->substrs, 1, struct reg_substr_data);
5981 if (RExC_recurse_count) {
5982 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5983 SAVEFREEPV(RExC_recurse);
5987 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
5988 Zero(r->substrs, 1, struct reg_substr_data);
5990 #ifdef TRIE_STUDY_OPT
5992 StructCopy(&zero_scan_data, &data, scan_data_t);
5993 copyRExC_state = RExC_state;
5996 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5998 RExC_state = copyRExC_state;
5999 if (seen & REG_TOP_LEVEL_BRANCHES)
6000 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
6002 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
6003 if (data.last_found) {
6004 SvREFCNT_dec(data.longest_fixed);
6005 SvREFCNT_dec(data.longest_float);
6006 SvREFCNT_dec(data.last_found);
6008 StructCopy(&zero_scan_data, &data, scan_data_t);
6011 StructCopy(&zero_scan_data, &data, scan_data_t);
6014 /* Dig out information for optimizations. */
6015 r->extflags = RExC_flags; /* was pm_op */
6016 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6019 SvUTF8_on(rx); /* Unicode in it? */
6020 ri->regstclass = NULL;
6021 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6022 r->intflags |= PREGf_NAUGHTY;
6023 scan = ri->program + 1; /* First BRANCH. */
6025 /* testing for BRANCH here tells us whether there is "must appear"
6026 data in the pattern. If there is then we can use it for optimisations */
6027 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
6029 STRLEN longest_float_length, longest_fixed_length;
6030 struct regnode_charclass_class ch_class; /* pointed to by data */
6032 I32 last_close = 0; /* pointed to by data */
6033 regnode *first= scan;
6034 regnode *first_next= regnext(first);
6036 * Skip introductions and multiplicators >= 1
6037 * so that we can extract the 'meat' of the pattern that must
6038 * match in the large if() sequence following.
6039 * NOTE that EXACT is NOT covered here, as it is normally
6040 * picked up by the optimiser separately.
6042 * This is unfortunate as the optimiser isnt handling lookahead
6043 * properly currently.
6046 while ((OP(first) == OPEN && (sawopen = 1)) ||
6047 /* An OR of *one* alternative - should not happen now. */
6048 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6049 /* for now we can't handle lookbehind IFMATCH*/
6050 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6051 (OP(first) == PLUS) ||
6052 (OP(first) == MINMOD) ||
6053 /* An {n,m} with n>0 */
6054 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6055 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6058 * the only op that could be a regnode is PLUS, all the rest
6059 * will be regnode_1 or regnode_2.
6062 if (OP(first) == PLUS)
6065 first += regarglen[OP(first)];
6067 first = NEXTOPER(first);
6068 first_next= regnext(first);
6071 /* Starting-point info. */
6073 DEBUG_PEEP("first:",first,0);
6074 /* Ignore EXACT as we deal with it later. */
6075 if (PL_regkind[OP(first)] == EXACT) {
6076 if (OP(first) == EXACT)
6077 NOOP; /* Empty, get anchored substr later. */
6079 ri->regstclass = first;
6082 else if (PL_regkind[OP(first)] == TRIE &&
6083 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6086 /* this can happen only on restudy */
6087 if ( OP(first) == TRIE ) {
6088 struct regnode_1 *trieop = (struct regnode_1 *)
6089 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6090 StructCopy(first,trieop,struct regnode_1);
6091 trie_op=(regnode *)trieop;
6093 struct regnode_charclass *trieop = (struct regnode_charclass *)
6094 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6095 StructCopy(first,trieop,struct regnode_charclass);
6096 trie_op=(regnode *)trieop;
6099 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6100 ri->regstclass = trie_op;
6103 else if (REGNODE_SIMPLE(OP(first)))
6104 ri->regstclass = first;
6105 else if (PL_regkind[OP(first)] == BOUND ||
6106 PL_regkind[OP(first)] == NBOUND)
6107 ri->regstclass = first;
6108 else if (PL_regkind[OP(first)] == BOL) {
6109 r->extflags |= (OP(first) == MBOL
6111 : (OP(first) == SBOL
6114 first = NEXTOPER(first);
6117 else if (OP(first) == GPOS) {
6118 r->extflags |= RXf_ANCH_GPOS;
6119 first = NEXTOPER(first);
6122 else if ((!sawopen || !RExC_sawback) &&
6123 (OP(first) == STAR &&
6124 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6125 !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
6127 /* turn .* into ^.* with an implied $*=1 */
6129 (OP(NEXTOPER(first)) == REG_ANY)
6132 r->extflags |= type;
6133 r->intflags |= PREGf_IMPLICIT;
6134 first = NEXTOPER(first);
6137 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
6138 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6139 /* x+ must match at the 1st pos of run of x's */
6140 r->intflags |= PREGf_SKIP;
6142 /* Scan is after the zeroth branch, first is atomic matcher. */
6143 #ifdef TRIE_STUDY_OPT
6146 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6147 (IV)(first - scan + 1))
6151 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6152 (IV)(first - scan + 1))
6158 * If there's something expensive in the r.e., find the
6159 * longest literal string that must appear and make it the
6160 * regmust. Resolve ties in favor of later strings, since
6161 * the regstart check works with the beginning of the r.e.
6162 * and avoiding duplication strengthens checking. Not a
6163 * strong reason, but sufficient in the absence of others.
6164 * [Now we resolve ties in favor of the earlier string if
6165 * it happens that c_offset_min has been invalidated, since the
6166 * earlier string may buy us something the later one won't.]
6169 data.longest_fixed = newSVpvs("");
6170 data.longest_float = newSVpvs("");
6171 data.last_found = newSVpvs("");
6172 data.longest = &(data.longest_fixed);
6174 if (!ri->regstclass) {
6175 cl_init(pRExC_state, &ch_class);
6176 data.start_class = &ch_class;
6177 stclass_flag = SCF_DO_STCLASS_AND;
6178 } else /* XXXX Check for BOUND? */
6180 data.last_closep = &last_close;
6182 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
6183 &data, -1, NULL, NULL,
6184 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
6190 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6191 && data.last_start_min == 0 && data.last_end > 0
6192 && !RExC_seen_zerolen
6193 && !(RExC_seen & REG_SEEN_VERBARG)
6194 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
6195 r->extflags |= RXf_CHECK_ALL;
6196 scan_commit(pRExC_state, &data,&minlen,0);
6197 SvREFCNT_dec(data.last_found);
6199 longest_float_length = CHR_SVLEN(data.longest_float);
6201 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6202 && data.offset_fixed == data.offset_float_min
6203 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6204 && S_setup_longest (aTHX_ pRExC_state,
6208 &(r->float_end_shift),
6209 data.lookbehind_float,
6210 data.offset_float_min,
6212 longest_float_length,
6213 data.flags & SF_FL_BEFORE_EOL,
6214 data.flags & SF_FL_BEFORE_MEOL))
6216 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6217 r->float_max_offset = data.offset_float_max;
6218 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
6219 r->float_max_offset -= data.lookbehind_float;
6222 r->float_substr = r->float_utf8 = NULL;
6223 SvREFCNT_dec(data.longest_float);
6224 longest_float_length = 0;
6227 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6229 if (S_setup_longest (aTHX_ pRExC_state,
6231 &(r->anchored_utf8),
6232 &(r->anchored_substr),
6233 &(r->anchored_end_shift),
6234 data.lookbehind_fixed,
6237 longest_fixed_length,
6238 data.flags & SF_FIX_BEFORE_EOL,
6239 data.flags & SF_FIX_BEFORE_MEOL))
6241 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6244 r->anchored_substr = r->anchored_utf8 = NULL;
6245 SvREFCNT_dec(data.longest_fixed);
6246 longest_fixed_length = 0;
6250 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6251 ri->regstclass = NULL;
6253 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6255 && !(data.start_class->flags & ANYOF_EOS)
6256 && !cl_is_anything(data.start_class))
6258 const U32 n = add_data(pRExC_state, 1, "f");
6259 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6261 Newx(RExC_rxi->data->data[n], 1,
6262 struct regnode_charclass_class);
6263 StructCopy(data.start_class,
6264 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6265 struct regnode_charclass_class);
6266 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6267 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6268 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6269 regprop(r, sv, (regnode*)data.start_class);
6270 PerlIO_printf(Perl_debug_log,
6271 "synthetic stclass \"%s\".\n",
6272 SvPVX_const(sv));});
6275 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
6276 if (longest_fixed_length > longest_float_length) {
6277 r->check_end_shift = r->anchored_end_shift;
6278 r->check_substr = r->anchored_substr;
6279 r->check_utf8 = r->anchored_utf8;
6280 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6281 if (r->extflags & RXf_ANCH_SINGLE)
6282 r->extflags |= RXf_NOSCAN;
6285 r->check_end_shift = r->float_end_shift;
6286 r->check_substr = r->float_substr;
6287 r->check_utf8 = r->float_utf8;
6288 r->check_offset_min = r->float_min_offset;
6289 r->check_offset_max = r->float_max_offset;
6291 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
6292 This should be changed ASAP! */
6293 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
6294 r->extflags |= RXf_USE_INTUIT;
6295 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6296 r->extflags |= RXf_INTUIT_TAIL;
6298 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6299 if ( (STRLEN)minlen < longest_float_length )
6300 minlen= longest_float_length;
6301 if ( (STRLEN)minlen < longest_fixed_length )
6302 minlen= longest_fixed_length;
6306 /* Several toplevels. Best we can is to set minlen. */
6308 struct regnode_charclass_class ch_class;
6311 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
6313 scan = ri->program + 1;
6314 cl_init(pRExC_state, &ch_class);
6315 data.start_class = &ch_class;
6316 data.last_closep = &last_close;
6319 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
6320 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
6324 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
6325 = r->float_substr = r->float_utf8 = NULL;
6327 if (!(data.start_class->flags & ANYOF_EOS)
6328 && !cl_is_anything(data.start_class))
6330 const U32 n = add_data(pRExC_state, 1, "f");
6331 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
6333 Newx(RExC_rxi->data->data[n], 1,
6334 struct regnode_charclass_class);
6335 StructCopy(data.start_class,
6336 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
6337 struct regnode_charclass_class);
6338 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6339 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6340 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
6341 regprop(r, sv, (regnode*)data.start_class);
6342 PerlIO_printf(Perl_debug_log,
6343 "synthetic stclass \"%s\".\n",
6344 SvPVX_const(sv));});
6348 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
6349 the "real" pattern. */
6351 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
6352 (IV)minlen, (IV)r->minlen);
6354 r->minlenret = minlen;
6355 if (r->minlen < minlen)
6358 if (RExC_seen & REG_SEEN_GPOS)
6359 r->extflags |= RXf_GPOS_SEEN;
6360 if (RExC_seen & REG_SEEN_LOOKBEHIND)
6361 r->extflags |= RXf_LOOKBEHIND_SEEN;
6362 if (pRExC_state->num_code_blocks)
6363 r->extflags |= RXf_EVAL_SEEN;
6364 if (RExC_seen & REG_SEEN_CANY)
6365 r->extflags |= RXf_CANY_SEEN;
6366 if (RExC_seen & REG_SEEN_VERBARG)
6368 r->intflags |= PREGf_VERBARG_SEEN;
6369 r->extflags |= RXf_MODIFIES_VARS;
6371 if (RExC_seen & REG_SEEN_CUTGROUP)
6372 r->intflags |= PREGf_CUTGROUP_SEEN;
6373 if (pm_flags & PMf_USE_RE_EVAL)
6374 r->intflags |= PREGf_USE_RE_EVAL;
6375 if (RExC_paren_names)
6376 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
6378 RXp_PAREN_NAMES(r) = NULL;
6380 #ifdef STUPID_PATTERN_CHECKS
6381 if (RX_PRELEN(rx) == 0)
6382 r->extflags |= RXf_NULL;
6383 if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
6384 r->extflags |= RXf_WHITE;
6385 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
6386 r->extflags |= RXf_START_ONLY;
6389 regnode *first = ri->program + 1;
6392 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
6393 r->extflags |= RXf_NULL;
6394 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
6395 r->extflags |= RXf_START_ONLY;
6396 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
6397 && OP(regnext(first)) == END)
6398 r->extflags |= RXf_WHITE;
6402 if (RExC_paren_names) {
6403 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
6404 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
6407 ri->name_list_idx = 0;
6409 if (RExC_recurse_count) {
6410 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
6411 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
6412 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
6415 Newxz(r->offs, RExC_npar, regexp_paren_pair);
6416 /* assume we don't need to swap parens around before we match */
6419 PerlIO_printf(Perl_debug_log,"Final program:\n");
6422 #ifdef RE_TRACK_PATTERN_OFFSETS
6423 DEBUG_OFFSETS_r(if (ri->u.offsets) {
6424 const U32 len = ri->u.offsets[0];
6426 GET_RE_DEBUG_FLAGS_DECL;
6427 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
6428 for (i = 1; i <= len; i++) {
6429 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
6430 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
6431 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
6433 PerlIO_printf(Perl_debug_log, "\n");
6441 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
6444 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
6446 PERL_UNUSED_ARG(value);
6448 if (flags & RXapif_FETCH) {
6449 return reg_named_buff_fetch(rx, key, flags);
6450 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6451 Perl_croak_no_modify(aTHX);
6453 } else if (flags & RXapif_EXISTS) {
6454 return reg_named_buff_exists(rx, key, flags)
6457 } else if (flags & RXapif_REGNAMES) {
6458 return reg_named_buff_all(rx, flags);
6459 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
6460 return reg_named_buff_scalar(rx, flags);
6462 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
6468 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
6471 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
6472 PERL_UNUSED_ARG(lastkey);
6474 if (flags & RXapif_FIRSTKEY)
6475 return reg_named_buff_firstkey(rx, flags);
6476 else if (flags & RXapif_NEXTKEY)
6477 return reg_named_buff_nextkey(rx, flags);
6479 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
6485 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
6488 AV *retarray = NULL;
6490 struct regexp *const rx = (struct regexp *)SvANY(r);
6492 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
6494 if (flags & RXapif_ALL)
6497 if (rx && RXp_PAREN_NAMES(rx)) {
6498 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
6501 SV* sv_dat=HeVAL(he_str);
6502 I32 *nums=(I32*)SvPVX(sv_dat);
6503 for ( i=0; i<SvIVX(sv_dat); i++ ) {
6504 if ((I32)(rx->nparens) >= nums[i]
6505 && rx->offs[nums[i]].start != -1
6506 && rx->offs[nums[i]].end != -1)
6509 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
6514 ret = newSVsv(&PL_sv_undef);
6517 av_push(retarray, ret);
6520 return newRV_noinc(MUTABLE_SV(retarray));
6527 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
6530 struct regexp *const rx = (struct regexp *)SvANY(r);
6532 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
6534 if (rx && RXp_PAREN_NAMES(rx)) {
6535 if (flags & RXapif_ALL) {
6536 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
6538 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6552 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
6554 struct regexp *const rx = (struct regexp *)SvANY(r);
6556 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
6558 if ( rx && RXp_PAREN_NAMES(rx) ) {
6559 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
6561 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
6568 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
6570 struct regexp *const rx = (struct regexp *)SvANY(r);
6571 GET_RE_DEBUG_FLAGS_DECL;
6573 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
6575 if (rx && RXp_PAREN_NAMES(rx)) {
6576 HV *hv = RXp_PAREN_NAMES(rx);
6578 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6581 SV* sv_dat = HeVAL(temphe);
6582 I32 *nums = (I32*)SvPVX(sv_dat);
6583 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6584 if ((I32)(rx->lastparen) >= nums[i] &&
6585 rx->offs[nums[i]].start != -1 &&
6586 rx->offs[nums[i]].end != -1)
6592 if (parno || flags & RXapif_ALL) {
6593 return newSVhek(HeKEY_hek(temphe));
6601 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
6606 struct regexp *const rx = (struct regexp *)SvANY(r);
6608 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
6610 if (rx && RXp_PAREN_NAMES(rx)) {
6611 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
6612 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
6613 } else if (flags & RXapif_ONE) {
6614 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
6615 av = MUTABLE_AV(SvRV(ret));
6616 length = av_len(av);
6618 return newSViv(length + 1);
6620 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
6624 return &PL_sv_undef;
6628 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
6630 struct regexp *const rx = (struct regexp *)SvANY(r);
6633 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
6635 if (rx && RXp_PAREN_NAMES(rx)) {
6636 HV *hv= RXp_PAREN_NAMES(rx);
6638 (void)hv_iterinit(hv);
6639 while ( (temphe = hv_iternext_flags(hv,0)) ) {
6642 SV* sv_dat = HeVAL(temphe);
6643 I32 *nums = (I32*)SvPVX(sv_dat);
6644 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
6645 if ((I32)(rx->lastparen) >= nums[i] &&
6646 rx->offs[nums[i]].start != -1 &&
6647 rx->offs[nums[i]].end != -1)
6653 if (parno || flags & RXapif_ALL) {
6654 av_push(av, newSVhek(HeKEY_hek(temphe)));
6659 return newRV_noinc(MUTABLE_SV(av));
6663 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
6666 struct regexp *const rx = (struct regexp *)SvANY(r);
6672 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
6674 if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
6675 || n == RX_BUFF_IDX_CARET_FULLMATCH
6676 || n == RX_BUFF_IDX_CARET_POSTMATCH
6678 && !(rx->extflags & RXf_PMf_KEEPCOPY)
6685 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
6686 /* no need to distinguish between them any more */
6687 n = RX_BUFF_IDX_FULLMATCH;
6689 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
6690 && rx->offs[0].start != -1)
6692 /* $`, ${^PREMATCH} */
6693 i = rx->offs[0].start;
6697 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
6698 && rx->offs[0].end != -1)
6700 /* $', ${^POSTMATCH} */
6701 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
6702 i = rx->sublen + rx->suboffset - rx->offs[0].end;
6705 if ( 0 <= n && n <= (I32)rx->nparens &&
6706 (s1 = rx->offs[n].start) != -1 &&
6707 (t1 = rx->offs[n].end) != -1)
6709 /* $&, ${^MATCH}, $1 ... */
6711 s = rx->subbeg + s1 - rx->suboffset;
6716 assert(s >= rx->subbeg);
6717 assert(rx->sublen >= (s - rx->subbeg) + i );
6719 const int oldtainted = PL_tainted;
6721 sv_setpvn(sv, s, i);
6722 PL_tainted = oldtainted;
6723 if ( (rx->extflags & RXf_CANY_SEEN)
6724 ? (RXp_MATCH_UTF8(rx)
6725 && (!i || is_utf8_string((U8*)s, i)))
6726 : (RXp_MATCH_UTF8(rx)) )
6733 if (RXp_MATCH_TAINTED(rx)) {
6734 if (SvTYPE(sv) >= SVt_PVMG) {
6735 MAGIC* const mg = SvMAGIC(sv);
6738 SvMAGIC_set(sv, mg->mg_moremagic);
6740 if ((mgt = SvMAGIC(sv))) {
6741 mg->mg_moremagic = mgt;
6742 SvMAGIC_set(sv, mg);
6753 sv_setsv(sv,&PL_sv_undef);
6759 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
6760 SV const * const value)
6762 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
6764 PERL_UNUSED_ARG(rx);
6765 PERL_UNUSED_ARG(paren);
6766 PERL_UNUSED_ARG(value);
6769 Perl_croak_no_modify(aTHX);
6773 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
6776 struct regexp *const rx = (struct regexp *)SvANY(r);
6780 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
6782 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
6784 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
6785 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6789 case RX_BUFF_IDX_PREMATCH: /* $` */
6790 if (rx->offs[0].start != -1) {
6791 i = rx->offs[0].start;
6800 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
6801 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6803 case RX_BUFF_IDX_POSTMATCH: /* $' */
6804 if (rx->offs[0].end != -1) {
6805 i = rx->sublen - rx->offs[0].end;
6807 s1 = rx->offs[0].end;
6814 case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
6815 if (!(rx->extflags & RXf_PMf_KEEPCOPY))
6819 /* $& / ${^MATCH}, $1, $2, ... */
6821 if (paren <= (I32)rx->nparens &&
6822 (s1 = rx->offs[paren].start) != -1 &&
6823 (t1 = rx->offs[paren].end) != -1)
6829 if (ckWARN(WARN_UNINITIALIZED))
6830 report_uninit((const SV *)sv);
6835 if (i > 0 && RXp_MATCH_UTF8(rx)) {
6836 const char * const s = rx->subbeg - rx->suboffset + s1;
6841 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6848 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
6850 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
6851 PERL_UNUSED_ARG(rx);
6855 return newSVpvs("Regexp");
6858 /* Scans the name of a named buffer from the pattern.
6859 * If flags is REG_RSN_RETURN_NULL returns null.
6860 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6861 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6862 * to the parsed name as looked up in the RExC_paren_names hash.
6863 * If there is an error throws a vFAIL().. type exception.
6866 #define REG_RSN_RETURN_NULL 0
6867 #define REG_RSN_RETURN_NAME 1
6868 #define REG_RSN_RETURN_DATA 2
6871 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6873 char *name_start = RExC_parse;
6875 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6877 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6878 /* skip IDFIRST by using do...while */
6881 RExC_parse += UTF8SKIP(RExC_parse);
6882 } while (isALNUM_utf8((U8*)RExC_parse));
6886 } while (isALNUM(*RExC_parse));
6888 RExC_parse++; /* so the <- from the vFAIL is after the offending character */
6889 vFAIL("Group name must start with a non-digit word character");
6893 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6894 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
6895 if ( flags == REG_RSN_RETURN_NAME)
6897 else if (flags==REG_RSN_RETURN_DATA) {
6900 if ( ! sv_name ) /* should not happen*/
6901 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6902 if (RExC_paren_names)
6903 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6905 sv_dat = HeVAL(he_str);
6907 vFAIL("Reference to nonexistent named group");
6911 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6912 (unsigned long) flags);
6914 assert(0); /* NOT REACHED */
6919 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6920 int rem=(int)(RExC_end - RExC_parse); \
6929 if (RExC_lastparse!=RExC_parse) \
6930 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6933 iscut ? "..." : "<" \
6936 PerlIO_printf(Perl_debug_log,"%16s",""); \
6939 num = RExC_size + 1; \
6941 num=REG_NODE_NUM(RExC_emit); \
6942 if (RExC_lastnum!=num) \
6943 PerlIO_printf(Perl_debug_log,"|%4d",num); \
6945 PerlIO_printf(Perl_debug_log,"|%4s",""); \
6946 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6947 (int)((depth*2)), "", \
6951 RExC_lastparse=RExC_parse; \
6956 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6957 DEBUG_PARSE_MSG((funcname)); \
6958 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6960 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6961 DEBUG_PARSE_MSG((funcname)); \
6962 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6965 /* This section of code defines the inversion list object and its methods. The
6966 * interfaces are highly subject to change, so as much as possible is static to
6967 * this file. An inversion list is here implemented as a malloc'd C UV array
6968 * with some added info that is placed as UVs at the beginning in a header
6969 * portion. An inversion list for Unicode is an array of code points, sorted
6970 * by ordinal number. The zeroth element is the first code point in the list.
6971 * The 1th element is the first element beyond that not in the list. In other
6972 * words, the first range is
6973 * invlist[0]..(invlist[1]-1)
6974 * The other ranges follow. Thus every element whose index is divisible by two
6975 * marks the beginning of a range that is in the list, and every element not
6976 * divisible by two marks the beginning of a range not in the list. A single
6977 * element inversion list that contains the single code point N generally
6978 * consists of two elements
6981 * (The exception is when N is the highest representable value on the
6982 * machine, in which case the list containing just it would be a single
6983 * element, itself. By extension, if the last range in the list extends to
6984 * infinity, then the first element of that range will be in the inversion list
6985 * at a position that is divisible by two, and is the final element in the
6987 * Taking the complement (inverting) an inversion list is quite simple, if the
6988 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6989 * This implementation reserves an element at the beginning of each inversion
6990 * list to contain 0 when the list contains 0, and contains 1 otherwise. The
6991 * actual beginning of the list is either that element if 0, or the next one if
6994 * More about inversion lists can be found in "Unicode Demystified"
6995 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
6996 * More will be coming when functionality is added later.
6998 * The inversion list data structure is currently implemented as an SV pointing
6999 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7000 * array of UV whose memory management is automatically handled by the existing
7001 * facilities for SV's.
7003 * Some of the methods should always be private to the implementation, and some
7004 * should eventually be made public */
7006 /* The header definitions are in F<inline_invlist.c> */
7008 #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
7009 #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
7011 #define INVLIST_INITIAL_LEN 10
7013 PERL_STATIC_INLINE UV*
7014 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7016 /* Returns a pointer to the first element in the inversion list's array.
7017 * This is called upon initialization of an inversion list. Where the
7018 * array begins depends on whether the list has the code point U+0000
7019 * in it or not. The other parameter tells it whether the code that
7020 * follows this call is about to put a 0 in the inversion list or not.
7021 * The first element is either the element with 0, if 0, or the next one,
7024 UV* zero = get_invlist_zero_addr(invlist);
7026 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7029 assert(! *_get_invlist_len_addr(invlist));
7031 /* 1^1 = 0; 1^0 = 1 */
7032 *zero = 1 ^ will_have_0;
7033 return zero + *zero;
7036 PERL_STATIC_INLINE UV*
7037 S_invlist_array(pTHX_ SV* const invlist)
7039 /* Returns the pointer to the inversion list's array. Every time the
7040 * length changes, this needs to be called in case malloc or realloc moved
7043 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7045 /* Must not be empty. If these fail, you probably didn't check for <len>
7046 * being non-zero before trying to get the array */
7047 assert(*_get_invlist_len_addr(invlist));
7048 assert(*get_invlist_zero_addr(invlist) == 0
7049 || *get_invlist_zero_addr(invlist) == 1);
7051 /* The array begins either at the element reserved for zero if the
7052 * list contains 0 (that element will be set to 0), or otherwise the next
7053 * element (in which case the reserved element will be set to 1). */
7054 return (UV *) (get_invlist_zero_addr(invlist)
7055 + *get_invlist_zero_addr(invlist));
7058 PERL_STATIC_INLINE void
7059 S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
7061 /* Sets the current number of elements stored in the inversion list */
7063 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7065 *_get_invlist_len_addr(invlist) = len;
7067 assert(len <= SvLEN(invlist));
7069 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
7070 /* If the list contains U+0000, that element is part of the header,
7071 * and should not be counted as part of the array. It will contain
7072 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
7074 * SvCUR_set(invlist,
7075 * TO_INTERNAL_SIZE(len
7076 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
7077 * But, this is only valid if len is not 0. The consequences of not doing
7078 * this is that the memory allocation code may think that 1 more UV is
7079 * being used than actually is, and so might do an unnecessary grow. That
7080 * seems worth not bothering to make this the precise amount.
7082 * Note that when inverting, SvCUR shouldn't change */
7085 PERL_STATIC_INLINE IV*
7086 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7088 /* Return the address of the UV that is reserved to hold the cached index
7091 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7093 return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
7096 PERL_STATIC_INLINE IV
7097 S_invlist_previous_index(pTHX_ SV* const invlist)
7099 /* Returns cached index of previous search */
7101 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7103 return *get_invlist_previous_index_addr(invlist);
7106 PERL_STATIC_INLINE void
7107 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7109 /* Caches <index> for later retrieval */
7111 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7113 assert(index == 0 || index < (int) _invlist_len(invlist));
7115 *get_invlist_previous_index_addr(invlist) = index;
7118 PERL_STATIC_INLINE UV
7119 S_invlist_max(pTHX_ SV* const invlist)
7121 /* Returns the maximum number of elements storable in the inversion list's
7122 * array, without having to realloc() */
7124 PERL_ARGS_ASSERT_INVLIST_MAX;
7126 return FROM_INTERNAL_SIZE(SvLEN(invlist));
7129 PERL_STATIC_INLINE UV*
7130 S_get_invlist_zero_addr(pTHX_ SV* invlist)
7132 /* Return the address of the UV that is reserved to hold 0 if the inversion
7133 * list contains 0. This has to be the last element of the heading, as the
7134 * list proper starts with either it if 0, or the next element if not.
7135 * (But we force it to contain either 0 or 1) */
7137 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
7139 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
7142 #ifndef PERL_IN_XSUB_RE
7144 Perl__new_invlist(pTHX_ IV initial_size)
7147 /* Return a pointer to a newly constructed inversion list, with enough
7148 * space to store 'initial_size' elements. If that number is negative, a
7149 * system default is used instead */
7153 if (initial_size < 0) {
7154 initial_size = INVLIST_INITIAL_LEN;
7157 /* Allocate the initial space */
7158 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
7159 invlist_set_len(new_list, 0);
7161 /* Force iterinit() to be used to get iteration to work */
7162 *get_invlist_iter_addr(new_list) = UV_MAX;
7164 /* This should force a segfault if a method doesn't initialize this
7166 *get_invlist_zero_addr(new_list) = UV_MAX;
7168 *get_invlist_previous_index_addr(new_list) = 0;
7169 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
7170 #if HEADER_LENGTH != 5
7171 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
7179 S__new_invlist_C_array(pTHX_ UV* list)
7181 /* Return a pointer to a newly constructed inversion list, initialized to
7182 * point to <list>, which has to be in the exact correct inversion list
7183 * form, including internal fields. Thus this is a dangerous routine that
7184 * should not be used in the wrong hands */
7186 SV* invlist = newSV_type(SVt_PV);
7188 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7190 SvPV_set(invlist, (char *) list);
7191 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7192 shouldn't touch it */
7193 SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
7195 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
7196 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7203 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7205 /* Grow the maximum size of an inversion list */
7207 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7209 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
7212 PERL_STATIC_INLINE void
7213 S_invlist_trim(pTHX_ SV* const invlist)
7215 PERL_ARGS_ASSERT_INVLIST_TRIM;
7217 /* Change the length of the inversion list to how many entries it currently
7220 SvPV_shrink_to_cur((SV *) invlist);
7223 #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
7226 S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
7228 /* Subject to change or removal. Append the range from 'start' to 'end' at
7229 * the end of the inversion list. The range must be above any existing
7233 UV max = invlist_max(invlist);
7234 UV len = _invlist_len(invlist);
7236 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
7238 if (len == 0) { /* Empty lists must be initialized */
7239 array = _invlist_array_init(invlist, start == 0);
7242 /* Here, the existing list is non-empty. The current max entry in the
7243 * list is generally the first value not in the set, except when the
7244 * set extends to the end of permissible values, in which case it is
7245 * the first entry in that final set, and so this call is an attempt to
7246 * append out-of-order */
7248 UV final_element = len - 1;
7249 array = invlist_array(invlist);
7250 if (array[final_element] > start
7251 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
7253 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
7254 array[final_element], start,
7255 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
7258 /* Here, it is a legal append. If the new range begins with the first
7259 * value not in the set, it is extending the set, so the new first
7260 * value not in the set is one greater than the newly extended range.
7262 if (array[final_element] == start) {
7263 if (end != UV_MAX) {
7264 array[final_element] = end + 1;
7267 /* But if the end is the maximum representable on the machine,
7268 * just let the range that this would extend to have no end */
7269 invlist_set_len(invlist, len - 1);
7275 /* Here the new range doesn't extend any existing set. Add it */
7277 len += 2; /* Includes an element each for the start and end of range */
7279 /* If overflows the existing space, extend, which may cause the array to be
7282 invlist_extend(invlist, len);
7283 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
7284 failure in invlist_array() */
7285 array = invlist_array(invlist);
7288 invlist_set_len(invlist, len);
7291 /* The next item on the list starts the range, the one after that is
7292 * one past the new range. */
7293 array[len - 2] = start;
7294 if (end != UV_MAX) {
7295 array[len - 1] = end + 1;
7298 /* But if the end is the maximum representable on the machine, just let
7299 * the range have no end */
7300 invlist_set_len(invlist, len - 1);
7304 #ifndef PERL_IN_XSUB_RE
7307 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
7309 /* Searches the inversion list for the entry that contains the input code
7310 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
7311 * return value is the index into the list's array of the range that
7316 IV high = _invlist_len(invlist);
7317 const IV highest_element = high - 1;
7320 PERL_ARGS_ASSERT__INVLIST_SEARCH;
7322 /* If list is empty, return failure. */
7327 /* If the code point is before the first element, return failure. (We
7328 * can't combine this with the test above, because we can't get the array
7329 * unless we know the list is non-empty) */
7330 array = invlist_array(invlist);
7332 mid = invlist_previous_index(invlist);
7333 assert(mid >=0 && mid <= highest_element);
7335 /* <mid> contains the cache of the result of the previous call to this
7336 * function (0 the first time). See if this call is for the same result,
7337 * or if it is for mid-1. This is under the theory that calls to this
7338 * function will often be for related code points that are near each other.
7339 * And benchmarks show that caching gives better results. We also test
7340 * here if the code point is within the bounds of the list. These tests
7341 * replace others that would have had to be made anyway to make sure that
7342 * the array bounds were not exceeded, and give us extra information at the
7344 if (cp >= array[mid]) {
7345 if (cp >= array[highest_element]) {
7346 return highest_element;
7349 /* Here, array[mid] <= cp < array[highest_element]. This means that
7350 * the final element is not the answer, so can exclude it; it also
7351 * means that <mid> is not the final element, so can refer to 'mid + 1'
7353 if (cp < array[mid + 1]) {
7359 else { /* cp < aray[mid] */
7360 if (cp < array[0]) { /* Fail if outside the array */
7364 if (cp >= array[mid - 1]) {
7369 /* Binary search. What we are looking for is <i> such that
7370 * array[i] <= cp < array[i+1]
7371 * The loop below converges on the i+1. Note that there may not be an
7372 * (i+1)th element in the array, and things work nonetheless */
7373 while (low < high) {
7374 mid = (low + high) / 2;
7375 assert(mid <= highest_element);
7376 if (array[mid] <= cp) { /* cp >= array[mid] */
7379 /* We could do this extra test to exit the loop early.
7380 if (cp < array[low]) {
7385 else { /* cp < array[mid] */
7392 invlist_set_previous_index(invlist, high);
7397 Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
7399 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
7400 * but is used when the swash has an inversion list. This makes this much
7401 * faster, as it uses a binary search instead of a linear one. This is
7402 * intimately tied to that function, and perhaps should be in utf8.c,
7403 * except it is intimately tied to inversion lists as well. It assumes
7404 * that <swatch> is all 0's on input */
7407 const IV len = _invlist_len(invlist);
7411 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
7413 if (len == 0) { /* Empty inversion list */
7417 array = invlist_array(invlist);
7419 /* Find which element it is */
7420 i = _invlist_search(invlist, start);
7422 /* We populate from <start> to <end> */
7423 while (current < end) {
7426 /* The inversion list gives the results for every possible code point
7427 * after the first one in the list. Only those ranges whose index is
7428 * even are ones that the inversion list matches. For the odd ones,
7429 * and if the initial code point is not in the list, we have to skip
7430 * forward to the next element */
7431 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
7433 if (i >= len) { /* Finished if beyond the end of the array */
7437 if (current >= end) { /* Finished if beyond the end of what we
7439 if (LIKELY(end < UV_MAX)) {
7443 /* We get here when the upper bound is the maximum
7444 * representable on the machine, and we are looking for just
7445 * that code point. Have to special case it */
7447 goto join_end_of_list;
7450 assert(current >= start);
7452 /* The current range ends one below the next one, except don't go past
7455 upper = (i < len && array[i] < end) ? array[i] : end;
7457 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
7458 * for each code point in it */
7459 for (; current < upper; current++) {
7460 const STRLEN offset = (STRLEN)(current - start);
7461 swatch[offset >> 3] |= 1 << (offset & 7);
7466 /* Quit if at the end of the list */
7469 /* But first, have to deal with the highest possible code point on
7470 * the platform. The previous code assumes that <end> is one
7471 * beyond where we want to populate, but that is impossible at the
7472 * platform's infinity, so have to handle it specially */
7473 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
7475 const STRLEN offset = (STRLEN)(end - start);
7476 swatch[offset >> 3] |= 1 << (offset & 7);
7481 /* Advance to the next range, which will be for code points not in the
7490 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
7492 /* Take the union of two inversion lists and point <output> to it. *output
7493 * should be defined upon input, and if it points to one of the two lists,
7494 * the reference count to that list will be decremented. The first list,
7495 * <a>, may be NULL, in which case a copy of the second list is returned.
7496 * If <complement_b> is TRUE, the union is taken of the complement
7497 * (inversion) of <b> instead of b itself.
7499 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7500 * Richard Gillam, published by Addison-Wesley, and explained at some
7501 * length there. The preface says to incorporate its examples into your
7502 * code at your own risk.
7504 * The algorithm is like a merge sort.
7506 * XXX A potential performance improvement is to keep track as we go along
7507 * if only one of the inputs contributes to the result, meaning the other
7508 * is a subset of that one. In that case, we can skip the final copy and
7509 * return the larger of the input lists, but then outside code might need
7510 * to keep track of whether to free the input list or not */
7512 UV* array_a; /* a's array */
7514 UV len_a; /* length of a's array */
7517 SV* u; /* the resulting union */
7521 UV i_a = 0; /* current index into a's array */
7525 /* running count, as explained in the algorithm source book; items are
7526 * stopped accumulating and are output when the count changes to/from 0.
7527 * The count is incremented when we start a range that's in the set, and
7528 * decremented when we start a range that's not in the set. So its range
7529 * is 0 to 2. Only when the count is zero is something not in the set.
7533 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
7536 /* If either one is empty, the union is the other one */
7537 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
7544 *output = invlist_clone(b);
7546 _invlist_invert(*output);
7548 } /* else *output already = b; */
7551 else if ((len_b = _invlist_len(b)) == 0) {
7556 /* The complement of an empty list is a list that has everything in it,
7557 * so the union with <a> includes everything too */
7562 *output = _new_invlist(1);
7563 _append_range_to_invlist(*output, 0, UV_MAX);
7565 else if (*output != a) {
7566 *output = invlist_clone(a);
7568 /* else *output already = a; */
7572 /* Here both lists exist and are non-empty */
7573 array_a = invlist_array(a);
7574 array_b = invlist_array(b);
7576 /* If are to take the union of 'a' with the complement of b, set it
7577 * up so are looking at b's complement. */
7580 /* To complement, we invert: if the first element is 0, remove it. To
7581 * do this, we just pretend the array starts one later, and clear the
7582 * flag as we don't have to do anything else later */
7583 if (array_b[0] == 0) {
7586 complement_b = FALSE;
7590 /* But if the first element is not zero, we unshift a 0 before the
7591 * array. The data structure reserves a space for that 0 (which
7592 * should be a '1' right now), so physical shifting is unneeded,
7593 * but temporarily change that element to 0. Before exiting the
7594 * routine, we must restore the element to '1' */
7601 /* Size the union for the worst case: that the sets are completely
7603 u = _new_invlist(len_a + len_b);
7605 /* Will contain U+0000 if either component does */
7606 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
7607 || (len_b > 0 && array_b[0] == 0));
7609 /* Go through each list item by item, stopping when exhausted one of
7611 while (i_a < len_a && i_b < len_b) {
7612 UV cp; /* The element to potentially add to the union's array */
7613 bool cp_in_set; /* is it in the the input list's set or not */
7615 /* We need to take one or the other of the two inputs for the union.
7616 * Since we are merging two sorted lists, we take the smaller of the
7617 * next items. In case of a tie, we take the one that is in its set
7618 * first. If we took one not in the set first, it would decrement the
7619 * count, possibly to 0 which would cause it to be output as ending the
7620 * range, and the next time through we would take the same number, and
7621 * output it again as beginning the next range. By doing it the
7622 * opposite way, there is no possibility that the count will be
7623 * momentarily decremented to 0, and thus the two adjoining ranges will
7624 * be seamlessly merged. (In a tie and both are in the set or both not
7625 * in the set, it doesn't matter which we take first.) */
7626 if (array_a[i_a] < array_b[i_b]
7627 || (array_a[i_a] == array_b[i_b]
7628 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7630 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7634 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7638 /* Here, have chosen which of the two inputs to look at. Only output
7639 * if the running count changes to/from 0, which marks the
7640 * beginning/end of a range in that's in the set */
7643 array_u[i_u++] = cp;
7650 array_u[i_u++] = cp;
7655 /* Here, we are finished going through at least one of the lists, which
7656 * means there is something remaining in at most one. We check if the list
7657 * that hasn't been exhausted is positioned such that we are in the middle
7658 * of a range in its set or not. (i_a and i_b point to the element beyond
7659 * the one we care about.) If in the set, we decrement 'count'; if 0, there
7660 * is potentially more to output.
7661 * There are four cases:
7662 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
7663 * in the union is entirely from the non-exhausted set.
7664 * 2) Both were in their sets, count is 2. Nothing further should
7665 * be output, as everything that remains will be in the exhausted
7666 * list's set, hence in the union; decrementing to 1 but not 0 insures
7668 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
7669 * Nothing further should be output because the union includes
7670 * everything from the exhausted set. Not decrementing ensures that.
7671 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
7672 * decrementing to 0 insures that we look at the remainder of the
7673 * non-exhausted set */
7674 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7675 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7680 /* The final length is what we've output so far, plus what else is about to
7681 * be output. (If 'count' is non-zero, then the input list we exhausted
7682 * has everything remaining up to the machine's limit in its set, and hence
7683 * in the union, so there will be no further output. */
7686 /* At most one of the subexpressions will be non-zero */
7687 len_u += (len_a - i_a) + (len_b - i_b);
7690 /* Set result to final length, which can change the pointer to array_u, so
7692 if (len_u != _invlist_len(u)) {
7693 invlist_set_len(u, len_u);
7695 array_u = invlist_array(u);
7698 /* When 'count' is 0, the list that was exhausted (if one was shorter than
7699 * the other) ended with everything above it not in its set. That means
7700 * that the remaining part of the union is precisely the same as the
7701 * non-exhausted list, so can just copy it unchanged. (If both list were
7702 * exhausted at the same time, then the operations below will be both 0.)
7705 IV copy_count; /* At most one will have a non-zero copy count */
7706 if ((copy_count = len_a - i_a) > 0) {
7707 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
7709 else if ((copy_count = len_b - i_b) > 0) {
7710 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
7714 /* We may be removing a reference to one of the inputs */
7715 if (a == *output || b == *output) {
7716 SvREFCNT_dec(*output);
7719 /* If we've changed b, restore it */
7729 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
7731 /* Take the intersection of two inversion lists and point <i> to it. *i
7732 * should be defined upon input, and if it points to one of the two lists,
7733 * the reference count to that list will be decremented.
7734 * If <complement_b> is TRUE, the result will be the intersection of <a>
7735 * and the complement (or inversion) of <b> instead of <b> directly.
7737 * The basis for this comes from "Unicode Demystified" Chapter 13 by
7738 * Richard Gillam, published by Addison-Wesley, and explained at some
7739 * length there. The preface says to incorporate its examples into your
7740 * code at your own risk. In fact, it had bugs
7742 * The algorithm is like a merge sort, and is essentially the same as the
7746 UV* array_a; /* a's array */
7748 UV len_a; /* length of a's array */
7751 SV* r; /* the resulting intersection */
7755 UV i_a = 0; /* current index into a's array */
7759 /* running count, as explained in the algorithm source book; items are
7760 * stopped accumulating and are output when the count changes to/from 2.
7761 * The count is incremented when we start a range that's in the set, and
7762 * decremented when we start a range that's not in the set. So its range
7763 * is 0 to 2. Only when the count is 2 is something in the intersection.
7767 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
7770 /* Special case if either one is empty */
7771 len_a = _invlist_len(a);
7772 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
7774 if (len_a != 0 && complement_b) {
7776 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
7777 * be empty. Here, also we are using 'b's complement, which hence
7778 * must be every possible code point. Thus the intersection is
7781 *i = invlist_clone(a);
7787 /* else *i is already 'a' */
7791 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
7792 * intersection must be empty */
7799 *i = _new_invlist(0);
7803 /* Here both lists exist and are non-empty */
7804 array_a = invlist_array(a);
7805 array_b = invlist_array(b);
7807 /* If are to take the intersection of 'a' with the complement of b, set it
7808 * up so are looking at b's complement. */
7811 /* To complement, we invert: if the first element is 0, remove it. To
7812 * do this, we just pretend the array starts one later, and clear the
7813 * flag as we don't have to do anything else later */
7814 if (array_b[0] == 0) {
7817 complement_b = FALSE;
7821 /* But if the first element is not zero, we unshift a 0 before the
7822 * array. The data structure reserves a space for that 0 (which
7823 * should be a '1' right now), so physical shifting is unneeded,
7824 * but temporarily change that element to 0. Before exiting the
7825 * routine, we must restore the element to '1' */
7832 /* Size the intersection for the worst case: that the intersection ends up
7833 * fragmenting everything to be completely disjoint */
7834 r= _new_invlist(len_a + len_b);
7836 /* Will contain U+0000 iff both components do */
7837 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
7838 && len_b > 0 && array_b[0] == 0);
7840 /* Go through each list item by item, stopping when exhausted one of
7842 while (i_a < len_a && i_b < len_b) {
7843 UV cp; /* The element to potentially add to the intersection's
7845 bool cp_in_set; /* Is it in the input list's set or not */
7847 /* We need to take one or the other of the two inputs for the
7848 * intersection. Since we are merging two sorted lists, we take the
7849 * smaller of the next items. In case of a tie, we take the one that
7850 * is not in its set first (a difference from the union algorithm). If
7851 * we took one in the set first, it would increment the count, possibly
7852 * to 2 which would cause it to be output as starting a range in the
7853 * intersection, and the next time through we would take that same
7854 * number, and output it again as ending the set. By doing it the
7855 * opposite of this, there is no possibility that the count will be
7856 * momentarily incremented to 2. (In a tie and both are in the set or
7857 * both not in the set, it doesn't matter which we take first.) */
7858 if (array_a[i_a] < array_b[i_b]
7859 || (array_a[i_a] == array_b[i_b]
7860 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
7862 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
7866 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
7870 /* Here, have chosen which of the two inputs to look at. Only output
7871 * if the running count changes to/from 2, which marks the
7872 * beginning/end of a range that's in the intersection */
7876 array_r[i_r++] = cp;
7881 array_r[i_r++] = cp;
7887 /* Here, we are finished going through at least one of the lists, which
7888 * means there is something remaining in at most one. We check if the list
7889 * that has been exhausted is positioned such that we are in the middle
7890 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7891 * the ones we care about.) There are four cases:
7892 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7893 * nothing left in the intersection.
7894 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7895 * above 2. What should be output is exactly that which is in the
7896 * non-exhausted set, as everything it has is also in the intersection
7897 * set, and everything it doesn't have can't be in the intersection
7898 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7899 * gets incremented to 2. Like the previous case, the intersection is
7900 * everything that remains in the non-exhausted set.
7901 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7902 * remains 1. And the intersection has nothing more. */
7903 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7904 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
7909 /* The final length is what we've output so far plus what else is in the
7910 * intersection. At most one of the subexpressions below will be non-zero */
7913 len_r += (len_a - i_a) + (len_b - i_b);
7916 /* Set result to final length, which can change the pointer to array_r, so
7918 if (len_r != _invlist_len(r)) {
7919 invlist_set_len(r, len_r);
7921 array_r = invlist_array(r);
7924 /* Finish outputting any remaining */
7925 if (count >= 2) { /* At most one will have a non-zero copy count */
7927 if ((copy_count = len_a - i_a) > 0) {
7928 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7930 else if ((copy_count = len_b - i_b) > 0) {
7931 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7935 /* We may be removing a reference to one of the inputs */
7936 if (a == *i || b == *i) {
7940 /* If we've changed b, restore it */
7950 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
7952 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7953 * set. A pointer to the inversion list is returned. This may actually be
7954 * a new list, in which case the passed in one has been destroyed. The
7955 * passed in inversion list can be NULL, in which case a new one is created
7956 * with just the one range in it */
7961 if (invlist == NULL) {
7962 invlist = _new_invlist(2);
7966 len = _invlist_len(invlist);
7969 /* If comes after the final entry, can just append it to the end */
7971 || start >= invlist_array(invlist)
7972 [_invlist_len(invlist) - 1])
7974 _append_range_to_invlist(invlist, start, end);
7978 /* Here, can't just append things, create and return a new inversion list
7979 * which is the union of this range and the existing inversion list */
7980 range_invlist = _new_invlist(2);
7981 _append_range_to_invlist(range_invlist, start, end);
7983 _invlist_union(invlist, range_invlist, &invlist);
7985 /* The temporary can be freed */
7986 SvREFCNT_dec(range_invlist);
7993 PERL_STATIC_INLINE SV*
7994 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
7995 return _add_range_to_invlist(invlist, cp, cp);
7998 #ifndef PERL_IN_XSUB_RE
8000 Perl__invlist_invert(pTHX_ SV* const invlist)
8002 /* Complement the input inversion list. This adds a 0 if the list didn't
8003 * have a zero; removes it otherwise. As described above, the data
8004 * structure is set up so that this is very efficient */
8006 UV* len_pos = _get_invlist_len_addr(invlist);
8008 PERL_ARGS_ASSERT__INVLIST_INVERT;
8010 /* The inverse of matching nothing is matching everything */
8011 if (*len_pos == 0) {
8012 _append_range_to_invlist(invlist, 0, UV_MAX);
8016 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
8017 * zero element was a 0, so it is being removed, so the length decrements
8018 * by 1; and vice-versa. SvCUR is unaffected */
8019 if (*get_invlist_zero_addr(invlist) ^= 1) {
8028 Perl__invlist_invert_prop(pTHX_ SV* const invlist)
8030 /* Complement the input inversion list (which must be a Unicode property,
8031 * all of which don't match above the Unicode maximum code point.) And
8032 * Perl has chosen to not have the inversion match above that either. This
8033 * adds a 0x110000 if the list didn't end with it, and removes it if it did
8039 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
8041 _invlist_invert(invlist);
8043 len = _invlist_len(invlist);
8045 if (len != 0) { /* If empty do nothing */
8046 array = invlist_array(invlist);
8047 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
8048 /* Add 0x110000. First, grow if necessary */
8050 if (invlist_max(invlist) < len) {
8051 invlist_extend(invlist, len);
8052 array = invlist_array(invlist);
8054 invlist_set_len(invlist, len);
8055 array[len - 1] = PERL_UNICODE_MAX + 1;
8057 else { /* Remove the 0x110000 */
8058 invlist_set_len(invlist, len - 1);
8066 PERL_STATIC_INLINE SV*
8067 S_invlist_clone(pTHX_ SV* const invlist)
8070 /* Return a new inversion list that is a copy of the input one, which is
8073 /* Need to allocate extra space to accommodate Perl's addition of a
8074 * trailing NUL to SvPV's, since it thinks they are always strings */
8075 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8076 STRLEN length = SvCUR(invlist);
8078 PERL_ARGS_ASSERT_INVLIST_CLONE;
8080 SvCUR_set(new_invlist, length); /* This isn't done automatically */
8081 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
8086 PERL_STATIC_INLINE UV*
8087 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8089 /* Return the address of the UV that contains the current iteration
8092 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8094 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
8097 PERL_STATIC_INLINE UV*
8098 S_get_invlist_version_id_addr(pTHX_ SV* invlist)
8100 /* Return the address of the UV that contains the version id. */
8102 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
8104 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
8107 PERL_STATIC_INLINE void
8108 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8110 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8112 *get_invlist_iter_addr(invlist) = 0;
8116 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8118 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8119 * This call sets in <*start> and <*end>, the next range in <invlist>.
8120 * Returns <TRUE> if successful and the next call will return the next
8121 * range; <FALSE> if was already at the end of the list. If the latter,
8122 * <*start> and <*end> are unchanged, and the next call to this function
8123 * will start over at the beginning of the list */
8125 UV* pos = get_invlist_iter_addr(invlist);
8126 UV len = _invlist_len(invlist);
8129 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8132 *pos = UV_MAX; /* Force iternit() to be required next time */
8136 array = invlist_array(invlist);
8138 *start = array[(*pos)++];
8144 *end = array[(*pos)++] - 1;
8150 PERL_STATIC_INLINE UV
8151 S_invlist_highest(pTHX_ SV* const invlist)
8153 /* Returns the highest code point that matches an inversion list. This API
8154 * has an ambiguity, as it returns 0 under either the highest is actually
8155 * 0, or if the list is empty. If this distinction matters to you, check
8156 * for emptiness before calling this function */
8158 UV len = _invlist_len(invlist);
8161 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8167 array = invlist_array(invlist);
8169 /* The last element in the array in the inversion list always starts a
8170 * range that goes to infinity. That range may be for code points that are
8171 * matched in the inversion list, or it may be for ones that aren't
8172 * matched. In the latter case, the highest code point in the set is one
8173 * less than the beginning of this range; otherwise it is the final element
8174 * of this range: infinity */
8175 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8177 : array[len - 1] - 1;
8180 #ifndef PERL_IN_XSUB_RE
8182 Perl__invlist_contents(pTHX_ SV* const invlist)
8184 /* Get the contents of an inversion list into a string SV so that they can
8185 * be printed out. It uses the format traditionally done for debug tracing
8189 SV* output = newSVpvs("\n");
8191 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
8193 invlist_iterinit(invlist);
8194 while (invlist_iternext(invlist, &start, &end)) {
8195 if (end == UV_MAX) {
8196 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
8198 else if (end != start) {
8199 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
8203 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
8213 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
8215 /* Dumps out the ranges in an inversion list. The string 'header'
8216 * if present is output on a line before the first range */
8220 if (header && strlen(header)) {
8221 PerlIO_printf(Perl_debug_log, "%s\n", header);
8223 invlist_iterinit(invlist);
8224 while (invlist_iternext(invlist, &start, &end)) {
8225 if (end == UV_MAX) {
8226 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
8229 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
8237 S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
8239 /* Return a boolean as to if the two passed in inversion lists are
8240 * identical. The final argument, if TRUE, says to take the complement of
8241 * the second inversion list before doing the comparison */
8243 UV* array_a = invlist_array(a);
8244 UV* array_b = invlist_array(b);
8245 UV len_a = _invlist_len(a);
8246 UV len_b = _invlist_len(b);
8248 UV i = 0; /* current index into the arrays */
8249 bool retval = TRUE; /* Assume are identical until proven otherwise */
8251 PERL_ARGS_ASSERT__INVLISTEQ;
8253 /* If are to compare 'a' with the complement of b, set it
8254 * up so are looking at b's complement. */
8257 /* The complement of nothing is everything, so <a> would have to have
8258 * just one element, starting at zero (ending at infinity) */
8260 return (len_a == 1 && array_a[0] == 0);
8262 else if (array_b[0] == 0) {
8264 /* Otherwise, to complement, we invert. Here, the first element is
8265 * 0, just remove it. To do this, we just pretend the array starts
8266 * one later, and clear the flag as we don't have to do anything
8271 complement_b = FALSE;
8275 /* But if the first element is not zero, we unshift a 0 before the
8276 * array. The data structure reserves a space for that 0 (which
8277 * should be a '1' right now), so physical shifting is unneeded,
8278 * but temporarily change that element to 0. Before exiting the
8279 * routine, we must restore the element to '1' */
8286 /* Make sure that the lengths are the same, as well as the final element
8287 * before looping through the remainder. (Thus we test the length, final,
8288 * and first elements right off the bat) */
8289 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
8292 else for (i = 0; i < len_a - 1; i++) {
8293 if (array_a[i] != array_b[i]) {
8306 #undef HEADER_LENGTH
8307 #undef INVLIST_INITIAL_LENGTH
8308 #undef TO_INTERNAL_SIZE
8309 #undef FROM_INTERNAL_SIZE
8310 #undef INVLIST_LEN_OFFSET
8311 #undef INVLIST_ZERO_OFFSET
8312 #undef INVLIST_ITER_OFFSET
8313 #undef INVLIST_VERSION_ID
8315 /* End of inversion list object */
8318 - reg - regular expression, i.e. main body or parenthesized thing
8320 * Caller must absorb opening parenthesis.
8322 * Combining parenthesis handling with the base level of regular expression
8323 * is a trifle forced, but the need to tie the tails of the branches to what
8324 * follows makes it hard to avoid.
8326 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
8328 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
8330 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
8334 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
8335 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
8338 regnode *ret; /* Will be the head of the group. */
8341 regnode *ender = NULL;
8344 U32 oregflags = RExC_flags;
8345 bool have_branch = 0;
8347 I32 freeze_paren = 0;
8348 I32 after_freeze = 0;
8350 /* for (?g), (?gc), and (?o) warnings; warning
8351 about (?c) will warn about (?g) -- japhy */
8353 #define WASTED_O 0x01
8354 #define WASTED_G 0x02
8355 #define WASTED_C 0x04
8356 #define WASTED_GC (0x02|0x04)
8357 I32 wastedflags = 0x00;
8359 char * parse_start = RExC_parse; /* MJD */
8360 char * const oregcomp_parse = RExC_parse;
8362 GET_RE_DEBUG_FLAGS_DECL;
8364 PERL_ARGS_ASSERT_REG;
8365 DEBUG_PARSE("reg ");
8367 *flagp = 0; /* Tentatively. */
8370 /* Make an OPEN node, if parenthesized. */
8372 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
8373 char *start_verb = RExC_parse;
8374 STRLEN verb_len = 0;
8375 char *start_arg = NULL;
8376 unsigned char op = 0;
8378 int internal_argval = 0; /* internal_argval is only useful if !argok */
8379 while ( *RExC_parse && *RExC_parse != ')' ) {
8380 if ( *RExC_parse == ':' ) {
8381 start_arg = RExC_parse + 1;
8387 verb_len = RExC_parse - start_verb;
8390 while ( *RExC_parse && *RExC_parse != ')' )
8392 if ( *RExC_parse != ')' )
8393 vFAIL("Unterminated verb pattern argument");
8394 if ( RExC_parse == start_arg )
8397 if ( *RExC_parse != ')' )
8398 vFAIL("Unterminated verb pattern");
8401 switch ( *start_verb ) {
8402 case 'A': /* (*ACCEPT) */
8403 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
8405 internal_argval = RExC_nestroot;
8408 case 'C': /* (*COMMIT) */
8409 if ( memEQs(start_verb,verb_len,"COMMIT") )
8412 case 'F': /* (*FAIL) */
8413 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
8418 case ':': /* (*:NAME) */
8419 case 'M': /* (*MARK:NAME) */
8420 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
8425 case 'P': /* (*PRUNE) */
8426 if ( memEQs(start_verb,verb_len,"PRUNE") )
8429 case 'S': /* (*SKIP) */
8430 if ( memEQs(start_verb,verb_len,"SKIP") )
8433 case 'T': /* (*THEN) */
8434 /* [19:06] <TimToady> :: is then */
8435 if ( memEQs(start_verb,verb_len,"THEN") ) {
8437 RExC_seen |= REG_SEEN_CUTGROUP;
8443 vFAIL3("Unknown verb pattern '%.*s'",
8444 verb_len, start_verb);
8447 if ( start_arg && internal_argval ) {
8448 vFAIL3("Verb pattern '%.*s' may not have an argument",
8449 verb_len, start_verb);
8450 } else if ( argok < 0 && !start_arg ) {
8451 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
8452 verb_len, start_verb);
8454 ret = reganode(pRExC_state, op, internal_argval);
8455 if ( ! internal_argval && ! SIZE_ONLY ) {
8457 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
8458 ARG(ret) = add_data( pRExC_state, 1, "S" );
8459 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
8466 if (!internal_argval)
8467 RExC_seen |= REG_SEEN_VERBARG;
8468 } else if ( start_arg ) {
8469 vFAIL3("Verb pattern '%.*s' may not have an argument",
8470 verb_len, start_verb);
8472 ret = reg_node(pRExC_state, op);
8474 nextchar(pRExC_state);
8477 if (*RExC_parse == '?') { /* (?...) */
8478 bool is_logical = 0;
8479 const char * const seqstart = RExC_parse;
8480 bool has_use_defaults = FALSE;
8483 paren = *RExC_parse++;
8484 ret = NULL; /* For look-ahead/behind. */
8487 case 'P': /* (?P...) variants for those used to PCRE/Python */
8488 paren = *RExC_parse++;
8489 if ( paren == '<') /* (?P<...>) named capture */
8491 else if (paren == '>') { /* (?P>name) named recursion */
8492 goto named_recursion;
8494 else if (paren == '=') { /* (?P=...) named backref */
8495 /* this pretty much dupes the code for \k<NAME> in regatom(), if
8496 you change this make sure you change that */
8497 char* name_start = RExC_parse;
8499 SV *sv_dat = reg_scan_name(pRExC_state,
8500 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8501 if (RExC_parse == name_start || *RExC_parse != ')')
8502 vFAIL2("Sequence %.3s... not terminated",parse_start);
8505 num = add_data( pRExC_state, 1, "S" );
8506 RExC_rxi->data->data[num]=(void*)sv_dat;
8507 SvREFCNT_inc_simple_void(sv_dat);
8510 ret = reganode(pRExC_state,
8513 : (ASCII_FOLD_RESTRICTED)
8515 : (AT_LEAST_UNI_SEMANTICS)
8523 Set_Node_Offset(ret, parse_start+1);
8524 Set_Node_Cur_Length(ret); /* MJD */
8526 nextchar(pRExC_state);
8530 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8532 case '<': /* (?<...) */
8533 if (*RExC_parse == '!')
8535 else if (*RExC_parse != '=')
8541 case '\'': /* (?'...') */
8542 name_start= RExC_parse;
8543 svname = reg_scan_name(pRExC_state,
8544 SIZE_ONLY ? /* reverse test from the others */
8545 REG_RSN_RETURN_NAME :
8546 REG_RSN_RETURN_NULL);
8547 if (RExC_parse == name_start) {
8549 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8552 if (*RExC_parse != paren)
8553 vFAIL2("Sequence (?%c... not terminated",
8554 paren=='>' ? '<' : paren);
8558 if (!svname) /* shouldn't happen */
8560 "panic: reg_scan_name returned NULL");
8561 if (!RExC_paren_names) {
8562 RExC_paren_names= newHV();
8563 sv_2mortal(MUTABLE_SV(RExC_paren_names));
8565 RExC_paren_name_list= newAV();
8566 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
8569 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
8571 sv_dat = HeVAL(he_str);
8573 /* croak baby croak */
8575 "panic: paren_name hash element allocation failed");
8576 } else if ( SvPOK(sv_dat) ) {
8577 /* (?|...) can mean we have dupes so scan to check
8578 its already been stored. Maybe a flag indicating
8579 we are inside such a construct would be useful,
8580 but the arrays are likely to be quite small, so
8581 for now we punt -- dmq */
8582 IV count = SvIV(sv_dat);
8583 I32 *pv = (I32*)SvPVX(sv_dat);
8585 for ( i = 0 ; i < count ; i++ ) {
8586 if ( pv[i] == RExC_npar ) {
8592 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
8593 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
8594 pv[count] = RExC_npar;
8595 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
8598 (void)SvUPGRADE(sv_dat,SVt_PVNV);
8599 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
8601 SvIV_set(sv_dat, 1);
8604 /* Yes this does cause a memory leak in debugging Perls */
8605 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
8606 SvREFCNT_dec(svname);
8609 /*sv_dump(sv_dat);*/
8611 nextchar(pRExC_state);
8613 goto capturing_parens;
8615 RExC_seen |= REG_SEEN_LOOKBEHIND;
8616 RExC_in_lookbehind++;
8618 case '=': /* (?=...) */
8619 RExC_seen_zerolen++;
8621 case '!': /* (?!...) */
8622 RExC_seen_zerolen++;
8623 if (*RExC_parse == ')') {
8624 ret=reg_node(pRExC_state, OPFAIL);
8625 nextchar(pRExC_state);
8629 case '|': /* (?|...) */
8630 /* branch reset, behave like a (?:...) except that
8631 buffers in alternations share the same numbers */
8633 after_freeze = freeze_paren = RExC_npar;
8635 case ':': /* (?:...) */
8636 case '>': /* (?>...) */
8638 case '$': /* (?$...) */
8639 case '@': /* (?@...) */
8640 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
8642 case '#': /* (?#...) */
8643 while (*RExC_parse && *RExC_parse != ')')
8645 if (*RExC_parse != ')')
8646 FAIL("Sequence (?#... not terminated");
8647 nextchar(pRExC_state);
8650 case '0' : /* (?0) */
8651 case 'R' : /* (?R) */
8652 if (*RExC_parse != ')')
8653 FAIL("Sequence (?R) not terminated");
8654 ret = reg_node(pRExC_state, GOSTART);
8655 *flagp |= POSTPONED;
8656 nextchar(pRExC_state);
8659 { /* named and numeric backreferences */
8661 case '&': /* (?&NAME) */
8662 parse_start = RExC_parse - 1;
8665 SV *sv_dat = reg_scan_name(pRExC_state,
8666 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8667 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8669 goto gen_recurse_regop;
8670 assert(0); /* NOT REACHED */
8672 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8674 vFAIL("Illegal pattern");
8676 goto parse_recursion;
8678 case '-': /* (?-1) */
8679 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
8680 RExC_parse--; /* rewind to let it be handled later */
8684 case '1': case '2': case '3': case '4': /* (?1) */
8685 case '5': case '6': case '7': case '8': case '9':
8688 num = atoi(RExC_parse);
8689 parse_start = RExC_parse - 1; /* MJD */
8690 if (*RExC_parse == '-')
8692 while (isDIGIT(*RExC_parse))
8694 if (*RExC_parse!=')')
8695 vFAIL("Expecting close bracket");
8698 if ( paren == '-' ) {
8700 Diagram of capture buffer numbering.
8701 Top line is the normal capture buffer numbers
8702 Bottom line is the negative indexing as from
8706 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
8710 num = RExC_npar + num;
8713 vFAIL("Reference to nonexistent group");
8715 } else if ( paren == '+' ) {
8716 num = RExC_npar + num - 1;
8719 ret = reganode(pRExC_state, GOSUB, num);
8721 if (num > (I32)RExC_rx->nparens) {
8723 vFAIL("Reference to nonexistent group");
8725 ARG2L_SET( ret, RExC_recurse_count++);
8727 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8728 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
8732 RExC_seen |= REG_SEEN_RECURSE;
8733 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
8734 Set_Node_Offset(ret, parse_start); /* MJD */
8736 *flagp |= POSTPONED;
8737 nextchar(pRExC_state);
8739 } /* named and numeric backreferences */
8740 assert(0); /* NOT REACHED */
8742 case '?': /* (??...) */
8744 if (*RExC_parse != '{') {
8746 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8749 *flagp |= POSTPONED;
8750 paren = *RExC_parse++;
8752 case '{': /* (?{...}) */
8755 struct reg_code_block *cb;
8757 RExC_seen_zerolen++;
8759 if ( !pRExC_state->num_code_blocks
8760 || pRExC_state->code_index >= pRExC_state->num_code_blocks
8761 || pRExC_state->code_blocks[pRExC_state->code_index].start
8762 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
8765 if (RExC_pm_flags & PMf_USE_RE_EVAL)
8766 FAIL("panic: Sequence (?{...}): no code block found\n");
8767 FAIL("Eval-group not allowed at runtime, use re 'eval'");
8769 /* this is a pre-compiled code block (?{...}) */
8770 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
8771 RExC_parse = RExC_start + cb->end;
8774 if (cb->src_regex) {
8775 n = add_data(pRExC_state, 2, "rl");
8776 RExC_rxi->data->data[n] =
8777 (void*)SvREFCNT_inc((SV*)cb->src_regex);
8778 RExC_rxi->data->data[n+1] = (void*)o;
8781 n = add_data(pRExC_state, 1,
8782 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
8783 RExC_rxi->data->data[n] = (void*)o;
8786 pRExC_state->code_index++;
8787 nextchar(pRExC_state);
8791 ret = reg_node(pRExC_state, LOGICAL);
8792 eval = reganode(pRExC_state, EVAL, n);
8795 /* for later propagation into (??{}) return value */
8796 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
8798 REGTAIL(pRExC_state, ret, eval);
8799 /* deal with the length of this later - MJD */
8802 ret = reganode(pRExC_state, EVAL, n);
8803 Set_Node_Length(ret, RExC_parse - parse_start + 1);
8804 Set_Node_Offset(ret, parse_start);
8807 case '(': /* (?(?{...})...) and (?(?=...)...) */
8810 if (RExC_parse[0] == '?') { /* (?(?...)) */
8811 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
8812 || RExC_parse[1] == '<'
8813 || RExC_parse[1] == '{') { /* Lookahead or eval. */
8816 ret = reg_node(pRExC_state, LOGICAL);
8819 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
8823 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
8824 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
8826 char ch = RExC_parse[0] == '<' ? '>' : '\'';
8827 char *name_start= RExC_parse++;
8829 SV *sv_dat=reg_scan_name(pRExC_state,
8830 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8831 if (RExC_parse == name_start || *RExC_parse != ch)
8832 vFAIL2("Sequence (?(%c... not terminated",
8833 (ch == '>' ? '<' : ch));
8836 num = add_data( pRExC_state, 1, "S" );
8837 RExC_rxi->data->data[num]=(void*)sv_dat;
8838 SvREFCNT_inc_simple_void(sv_dat);
8840 ret = reganode(pRExC_state,NGROUPP,num);
8841 goto insert_if_check_paren;
8843 else if (RExC_parse[0] == 'D' &&
8844 RExC_parse[1] == 'E' &&
8845 RExC_parse[2] == 'F' &&
8846 RExC_parse[3] == 'I' &&
8847 RExC_parse[4] == 'N' &&
8848 RExC_parse[5] == 'E')
8850 ret = reganode(pRExC_state,DEFINEP,0);
8853 goto insert_if_check_paren;
8855 else if (RExC_parse[0] == 'R') {
8858 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8859 parno = atoi(RExC_parse++);
8860 while (isDIGIT(*RExC_parse))
8862 } else if (RExC_parse[0] == '&') {
8865 sv_dat = reg_scan_name(pRExC_state,
8866 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
8867 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
8869 ret = reganode(pRExC_state,INSUBP,parno);
8870 goto insert_if_check_paren;
8872 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
8875 parno = atoi(RExC_parse++);
8877 while (isDIGIT(*RExC_parse))
8879 ret = reganode(pRExC_state, GROUPP, parno);
8881 insert_if_check_paren:
8882 if ((c = *nextchar(pRExC_state)) != ')')
8883 vFAIL("Switch condition not recognized");
8885 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
8886 br = regbranch(pRExC_state, &flags, 1,depth+1);
8888 br = reganode(pRExC_state, LONGJMP, 0);
8890 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
8891 c = *nextchar(pRExC_state);
8896 vFAIL("(?(DEFINE)....) does not allow branches");
8897 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
8898 regbranch(pRExC_state, &flags, 1,depth+1);
8899 REGTAIL(pRExC_state, ret, lastbr);
8902 c = *nextchar(pRExC_state);
8907 vFAIL("Switch (?(condition)... contains too many branches");
8908 ender = reg_node(pRExC_state, TAIL);
8909 REGTAIL(pRExC_state, br, ender);
8911 REGTAIL(pRExC_state, lastbr, ender);
8912 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
8915 REGTAIL(pRExC_state, ret, ender);
8916 RExC_size++; /* XXX WHY do we need this?!!
8917 For large programs it seems to be required
8918 but I can't figure out why. -- dmq*/
8922 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
8926 RExC_parse--; /* for vFAIL to print correctly */
8927 vFAIL("Sequence (? incomplete");
8929 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
8931 has_use_defaults = TRUE;
8932 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
8933 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
8934 ? REGEX_UNICODE_CHARSET
8935 : REGEX_DEPENDS_CHARSET);
8939 parse_flags: /* (?i) */
8941 U32 posflags = 0, negflags = 0;
8942 U32 *flagsp = &posflags;
8943 char has_charset_modifier = '\0';
8944 regex_charset cs = get_regex_charset(RExC_flags);
8945 if (cs == REGEX_DEPENDS_CHARSET
8946 && (RExC_utf8 || RExC_uni_semantics))
8948 cs = REGEX_UNICODE_CHARSET;
8951 while (*RExC_parse) {
8952 /* && strchr("iogcmsx", *RExC_parse) */
8953 /* (?g), (?gc) and (?o) are useless here
8954 and must be globally applied -- japhy */
8955 switch (*RExC_parse) {
8956 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
8957 case LOCALE_PAT_MOD:
8958 if (has_charset_modifier) {
8959 goto excess_modifier;
8961 else if (flagsp == &negflags) {
8964 cs = REGEX_LOCALE_CHARSET;
8965 has_charset_modifier = LOCALE_PAT_MOD;
8966 RExC_contains_locale = 1;
8968 case UNICODE_PAT_MOD:
8969 if (has_charset_modifier) {
8970 goto excess_modifier;
8972 else if (flagsp == &negflags) {
8975 cs = REGEX_UNICODE_CHARSET;
8976 has_charset_modifier = UNICODE_PAT_MOD;
8978 case ASCII_RESTRICT_PAT_MOD:
8979 if (flagsp == &negflags) {
8982 if (has_charset_modifier) {
8983 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8984 goto excess_modifier;
8986 /* Doubled modifier implies more restricted */
8987 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8990 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8992 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
8994 case DEPENDS_PAT_MOD:
8995 if (has_use_defaults) {
8996 goto fail_modifiers;
8998 else if (flagsp == &negflags) {
9001 else if (has_charset_modifier) {
9002 goto excess_modifier;
9005 /* The dual charset means unicode semantics if the
9006 * pattern (or target, not known until runtime) are
9007 * utf8, or something in the pattern indicates unicode
9009 cs = (RExC_utf8 || RExC_uni_semantics)
9010 ? REGEX_UNICODE_CHARSET
9011 : REGEX_DEPENDS_CHARSET;
9012 has_charset_modifier = DEPENDS_PAT_MOD;
9016 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9017 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9019 else if (has_charset_modifier == *(RExC_parse - 1)) {
9020 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
9023 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9028 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
9030 case ONCE_PAT_MOD: /* 'o' */
9031 case GLOBAL_PAT_MOD: /* 'g' */
9032 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9033 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9034 if (! (wastedflags & wflagbit) ) {
9035 wastedflags |= wflagbit;
9038 "Useless (%s%c) - %suse /%c modifier",
9039 flagsp == &negflags ? "?-" : "?",
9041 flagsp == &negflags ? "don't " : "",
9048 case CONTINUE_PAT_MOD: /* 'c' */
9049 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9050 if (! (wastedflags & WASTED_C) ) {
9051 wastedflags |= WASTED_GC;
9054 "Useless (%sc) - %suse /gc modifier",
9055 flagsp == &negflags ? "?-" : "?",
9056 flagsp == &negflags ? "don't " : ""
9061 case KEEPCOPY_PAT_MOD: /* 'p' */
9062 if (flagsp == &negflags) {
9064 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9066 *flagsp |= RXf_PMf_KEEPCOPY;
9070 /* A flag is a default iff it is following a minus, so
9071 * if there is a minus, it means will be trying to
9072 * re-specify a default which is an error */
9073 if (has_use_defaults || flagsp == &negflags) {
9076 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9080 wastedflags = 0; /* reset so (?g-c) warns twice */
9086 RExC_flags |= posflags;
9087 RExC_flags &= ~negflags;
9088 set_regex_charset(&RExC_flags, cs);
9090 oregflags |= posflags;
9091 oregflags &= ~negflags;
9092 set_regex_charset(&oregflags, cs);
9094 nextchar(pRExC_state);
9105 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
9110 }} /* one for the default block, one for the switch */
9117 ret = reganode(pRExC_state, OPEN, parno);
9120 RExC_nestroot = parno;
9121 if (RExC_seen & REG_SEEN_RECURSE
9122 && !RExC_open_parens[parno-1])
9124 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9125 "Setting open paren #%"IVdf" to %d\n",
9126 (IV)parno, REG_NODE_NUM(ret)));
9127 RExC_open_parens[parno-1]= ret;
9130 Set_Node_Length(ret, 1); /* MJD */
9131 Set_Node_Offset(ret, RExC_parse); /* MJD */
9139 /* Pick up the branches, linking them together. */
9140 parse_start = RExC_parse; /* MJD */
9141 br = regbranch(pRExC_state, &flags, 1,depth+1);
9143 /* branch_len = (paren != 0); */
9147 if (*RExC_parse == '|') {
9148 if (!SIZE_ONLY && RExC_extralen) {
9149 reginsert(pRExC_state, BRANCHJ, br, depth+1);
9152 reginsert(pRExC_state, BRANCH, br, depth+1);
9153 Set_Node_Length(br, paren != 0);
9154 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
9158 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
9160 else if (paren == ':') {
9161 *flagp |= flags&SIMPLE;
9163 if (is_open) { /* Starts with OPEN. */
9164 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
9166 else if (paren != '?') /* Not Conditional */
9168 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9170 while (*RExC_parse == '|') {
9171 if (!SIZE_ONLY && RExC_extralen) {
9172 ender = reganode(pRExC_state, LONGJMP,0);
9173 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
9176 RExC_extralen += 2; /* Account for LONGJMP. */
9177 nextchar(pRExC_state);
9179 if (RExC_npar > after_freeze)
9180 after_freeze = RExC_npar;
9181 RExC_npar = freeze_paren;
9183 br = regbranch(pRExC_state, &flags, 0, depth+1);
9187 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
9189 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
9192 if (have_branch || paren != ':') {
9193 /* Make a closing node, and hook it on the end. */
9196 ender = reg_node(pRExC_state, TAIL);
9199 ender = reganode(pRExC_state, CLOSE, parno);
9200 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
9201 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9202 "Setting close paren #%"IVdf" to %d\n",
9203 (IV)parno, REG_NODE_NUM(ender)));
9204 RExC_close_parens[parno-1]= ender;
9205 if (RExC_nestroot == parno)
9208 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
9209 Set_Node_Length(ender,1); /* MJD */
9215 *flagp &= ~HASWIDTH;
9218 ender = reg_node(pRExC_state, SUCCEED);
9221 ender = reg_node(pRExC_state, END);
9223 assert(!RExC_opend); /* there can only be one! */
9228 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9229 SV * const mysv_val1=sv_newmortal();
9230 SV * const mysv_val2=sv_newmortal();
9231 DEBUG_PARSE_MSG("lsbr");
9232 regprop(RExC_rx, mysv_val1, lastbr);
9233 regprop(RExC_rx, mysv_val2, ender);
9234 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9235 SvPV_nolen_const(mysv_val1),
9236 (IV)REG_NODE_NUM(lastbr),
9237 SvPV_nolen_const(mysv_val2),
9238 (IV)REG_NODE_NUM(ender),
9239 (IV)(ender - lastbr)
9242 REGTAIL(pRExC_state, lastbr, ender);
9244 if (have_branch && !SIZE_ONLY) {
9247 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
9249 /* Hook the tails of the branches to the closing node. */
9250 for (br = ret; br; br = regnext(br)) {
9251 const U8 op = PL_regkind[OP(br)];
9253 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9254 if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
9257 else if (op == BRANCHJ) {
9258 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9259 /* for now we always disable this optimisation * /
9260 if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
9266 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
9267 DEBUG_PARSE_r(if (!SIZE_ONLY) {
9268 SV * const mysv_val1=sv_newmortal();
9269 SV * const mysv_val2=sv_newmortal();
9270 DEBUG_PARSE_MSG("NADA");
9271 regprop(RExC_rx, mysv_val1, ret);
9272 regprop(RExC_rx, mysv_val2, ender);
9273 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
9274 SvPV_nolen_const(mysv_val1),
9275 (IV)REG_NODE_NUM(ret),
9276 SvPV_nolen_const(mysv_val2),
9277 (IV)REG_NODE_NUM(ender),
9282 if (OP(ender) == TAIL) {
9287 for ( opt= br + 1; opt < ender ; opt++ )
9289 NEXT_OFF(br)= ender - br;
9297 static const char parens[] = "=!<,>";
9299 if (paren && (p = strchr(parens, paren))) {
9300 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
9301 int flag = (p - parens) > 1;
9304 node = SUSPEND, flag = 0;
9305 reginsert(pRExC_state, node,ret, depth+1);
9306 Set_Node_Cur_Length(ret);
9307 Set_Node_Offset(ret, parse_start + 1);
9309 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
9313 /* Check for proper termination. */
9315 RExC_flags = oregflags;
9316 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
9317 RExC_parse = oregcomp_parse;
9318 vFAIL("Unmatched (");
9321 else if (!paren && RExC_parse < RExC_end) {
9322 if (*RExC_parse == ')') {
9324 vFAIL("Unmatched )");
9327 FAIL("Junk on end of regexp"); /* "Can't happen". */
9328 assert(0); /* NOTREACHED */
9331 if (RExC_in_lookbehind) {
9332 RExC_in_lookbehind--;
9334 if (after_freeze > RExC_npar)
9335 RExC_npar = after_freeze;
9340 - regbranch - one alternative of an | operator
9342 * Implements the concatenation operator.
9345 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
9349 regnode *chain = NULL;
9351 I32 flags = 0, c = 0;
9352 GET_RE_DEBUG_FLAGS_DECL;
9354 PERL_ARGS_ASSERT_REGBRANCH;
9356 DEBUG_PARSE("brnc");
9361 if (!SIZE_ONLY && RExC_extralen)
9362 ret = reganode(pRExC_state, BRANCHJ,0);
9364 ret = reg_node(pRExC_state, BRANCH);
9365 Set_Node_Length(ret, 1);
9369 if (!first && SIZE_ONLY)
9370 RExC_extralen += 1; /* BRANCHJ */
9372 *flagp = WORST; /* Tentatively. */
9375 nextchar(pRExC_state);
9376 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
9378 latest = regpiece(pRExC_state, &flags,depth+1);
9379 if (latest == NULL) {
9380 if (flags & TRYAGAIN)
9384 else if (ret == NULL)
9386 *flagp |= flags&(HASWIDTH|POSTPONED);
9387 if (chain == NULL) /* First piece. */
9388 *flagp |= flags&SPSTART;
9391 REGTAIL(pRExC_state, chain, latest);
9396 if (chain == NULL) { /* Loop ran zero times. */
9397 chain = reg_node(pRExC_state, NOTHING);
9402 *flagp |= flags&SIMPLE;
9409 - regpiece - something followed by possible [*+?]
9411 * Note that the branching code sequences used for ? and the general cases
9412 * of * and + are somewhat optimized: they use the same NOTHING node as
9413 * both the endmarker for their branch list and the body of the last branch.
9414 * It might seem that this node could be dispensed with entirely, but the
9415 * endmarker role is not redundant.
9418 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
9425 const char * const origparse = RExC_parse;
9427 I32 max = REG_INFTY;
9428 #ifdef RE_TRACK_PATTERN_OFFSETS
9431 const char *maxpos = NULL;
9433 /* Save the original in case we change the emitted regop to a FAIL. */
9434 regnode * const orig_emit = RExC_emit;
9436 GET_RE_DEBUG_FLAGS_DECL;
9438 PERL_ARGS_ASSERT_REGPIECE;
9440 DEBUG_PARSE("piec");
9442 ret = regatom(pRExC_state, &flags,depth+1);
9444 if (flags & TRYAGAIN)
9451 if (op == '{' && regcurly(RExC_parse)) {
9453 #ifdef RE_TRACK_PATTERN_OFFSETS
9454 parse_start = RExC_parse; /* MJD */
9456 next = RExC_parse + 1;
9457 while (isDIGIT(*next) || *next == ',') {
9466 if (*next == '}') { /* got one */
9470 min = atoi(RExC_parse);
9474 maxpos = RExC_parse;
9476 if (!max && *maxpos != '0')
9477 max = REG_INFTY; /* meaning "infinity" */
9478 else if (max >= REG_INFTY)
9479 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
9481 nextchar(pRExC_state);
9482 if (max < min) { /* If can't match, warn and optimize to fail
9485 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
9487 /* We can't back off the size because we have to reserve
9488 * enough space for all the things we are about to throw
9489 * away, but we can shrink it by the ammount we are about
9491 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
9494 RExC_emit = orig_emit;
9496 ret = reg_node(pRExC_state, OPFAIL);
9501 if ((flags&SIMPLE)) {
9502 RExC_naughty += 2 + RExC_naughty / 2;
9503 reginsert(pRExC_state, CURLY, ret, depth+1);
9504 Set_Node_Offset(ret, parse_start+1); /* MJD */
9505 Set_Node_Cur_Length(ret);
9508 regnode * const w = reg_node(pRExC_state, WHILEM);
9511 REGTAIL(pRExC_state, ret, w);
9512 if (!SIZE_ONLY && RExC_extralen) {
9513 reginsert(pRExC_state, LONGJMP,ret, depth+1);
9514 reginsert(pRExC_state, NOTHING,ret, depth+1);
9515 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
9517 reginsert(pRExC_state, CURLYX,ret, depth+1);
9519 Set_Node_Offset(ret, parse_start+1);
9520 Set_Node_Length(ret,
9521 op == '{' ? (RExC_parse - parse_start) : 1);
9523 if (!SIZE_ONLY && RExC_extralen)
9524 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
9525 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
9527 RExC_whilem_seen++, RExC_extralen += 3;
9528 RExC_naughty += 4 + RExC_naughty; /* compound interest */
9537 ARG1_SET(ret, (U16)min);
9538 ARG2_SET(ret, (U16)max);
9550 #if 0 /* Now runtime fix should be reliable. */
9552 /* if this is reinstated, don't forget to put this back into perldiag:
9554 =item Regexp *+ operand could be empty at {#} in regex m/%s/
9556 (F) The part of the regexp subject to either the * or + quantifier
9557 could match an empty string. The {#} shows in the regular
9558 expression about where the problem was discovered.
9562 if (!(flags&HASWIDTH) && op != '?')
9563 vFAIL("Regexp *+ operand could be empty");
9566 #ifdef RE_TRACK_PATTERN_OFFSETS
9567 parse_start = RExC_parse;
9569 nextchar(pRExC_state);
9571 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
9573 if (op == '*' && (flags&SIMPLE)) {
9574 reginsert(pRExC_state, STAR, ret, depth+1);
9578 else if (op == '*') {
9582 else if (op == '+' && (flags&SIMPLE)) {
9583 reginsert(pRExC_state, PLUS, ret, depth+1);
9587 else if (op == '+') {
9591 else if (op == '?') {
9596 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
9597 ckWARN3reg(RExC_parse,
9598 "%.*s matches null string many times",
9599 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
9603 if (RExC_parse < RExC_end && *RExC_parse == '?') {
9604 nextchar(pRExC_state);
9605 reginsert(pRExC_state, MINMOD, ret, depth+1);
9606 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
9608 #ifndef REG_ALLOW_MINMOD_SUSPEND
9611 if (RExC_parse < RExC_end && *RExC_parse == '+') {
9613 nextchar(pRExC_state);
9614 ender = reg_node(pRExC_state, SUCCEED);
9615 REGTAIL(pRExC_state, ret, ender);
9616 reginsert(pRExC_state, SUSPEND, ret, depth+1);
9618 ender = reg_node(pRExC_state, TAIL);
9619 REGTAIL(pRExC_state, ret, ender);
9623 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
9625 vFAIL("Nested quantifiers");
9632 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
9635 /* This is expected to be called by a parser routine that has recognized '\N'
9636 and needs to handle the rest. RExC_parse is expected to point at the first
9637 char following the N at the time of the call. On successful return,
9638 RExC_parse has been updated to point to just after the sequence identified
9639 by this routine, and <*flagp> has been updated.
9641 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
9644 \N may begin either a named sequence, or if outside a character class, mean
9645 to match a non-newline. For non single-quoted regexes, the tokenizer has
9646 attempted to decide which, and in the case of a named sequence, converted it
9647 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
9648 where c1... are the characters in the sequence. For single-quoted regexes,
9649 the tokenizer passes the \N sequence through unchanged; this code will not
9650 attempt to determine this nor expand those, instead raising a syntax error.
9651 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
9652 or there is no '}', it signals that this \N occurrence means to match a
9655 Only the \N{U+...} form should occur in a character class, for the same
9656 reason that '.' inside a character class means to just match a period: it
9657 just doesn't make sense.
9659 The function raises an error (via vFAIL), and doesn't return for various
9660 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
9661 success; it returns FALSE otherwise.
9663 If <valuep> is non-null, it means the caller can accept an input sequence
9664 consisting of a just a single code point; <*valuep> is set to that value
9665 if the input is such.
9667 If <node_p> is non-null it signifies that the caller can accept any other
9668 legal sequence (i.e., one that isn't just a single code point). <*node_p>
9670 1) \N means not-a-NL: points to a newly created REG_ANY node;
9671 2) \N{}: points to a new NOTHING node;
9672 3) otherwise: points to a new EXACT node containing the resolved
9674 Note that FALSE is returned for single code point sequences if <valuep> is
9678 char * endbrace; /* '}' following the name */
9680 char *endchar; /* Points to '.' or '}' ending cur char in the input
9682 bool has_multiple_chars; /* true if the input stream contains a sequence of
9683 more than one character */
9685 GET_RE_DEBUG_FLAGS_DECL;
9687 PERL_ARGS_ASSERT_GROK_BSLASH_N;
9691 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
9693 /* The [^\n] meaning of \N ignores spaces and comments under the /x
9694 * modifier. The other meaning does not */
9695 p = (RExC_flags & RXf_PMf_EXTENDED)
9696 ? regwhite( pRExC_state, RExC_parse )
9699 /* Disambiguate between \N meaning a named character versus \N meaning
9700 * [^\n]. The former is assumed when it can't be the latter. */
9701 if (*p != '{' || regcurly(p)) {
9704 /* no bare \N in a charclass */
9705 if (in_char_class) {
9706 vFAIL("\\N in a character class must be a named character: \\N{...}");
9710 nextchar(pRExC_state);
9711 *node_p = reg_node(pRExC_state, REG_ANY);
9712 *flagp |= HASWIDTH|SIMPLE;
9715 Set_Node_Length(*node_p, 1); /* MJD */
9719 /* Here, we have decided it should be a named character or sequence */
9721 /* The test above made sure that the next real character is a '{', but
9722 * under the /x modifier, it could be separated by space (or a comment and
9723 * \n) and this is not allowed (for consistency with \x{...} and the
9724 * tokenizer handling of \N{NAME}). */
9725 if (*RExC_parse != '{') {
9726 vFAIL("Missing braces on \\N{}");
9729 RExC_parse++; /* Skip past the '{' */
9731 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
9732 || ! (endbrace == RExC_parse /* nothing between the {} */
9733 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
9734 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
9736 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
9737 vFAIL("\\N{NAME} must be resolved by the lexer");
9740 if (endbrace == RExC_parse) { /* empty: \N{} */
9743 *node_p = reg_node(pRExC_state,NOTHING);
9745 else if (in_char_class) {
9746 if (SIZE_ONLY && in_char_class) {
9747 ckWARNreg(RExC_parse,
9748 "Ignoring zero length \\N{} in character class"
9756 nextchar(pRExC_state);
9760 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
9761 RExC_parse += 2; /* Skip past the 'U+' */
9763 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9765 /* Code points are separated by dots. If none, there is only one code
9766 * point, and is terminated by the brace */
9767 has_multiple_chars = (endchar < endbrace);
9769 if (valuep && (! has_multiple_chars || in_char_class)) {
9770 /* We only pay attention to the first char of
9771 multichar strings being returned in char classes. I kinda wonder
9772 if this makes sense as it does change the behaviour
9773 from earlier versions, OTOH that behaviour was broken
9774 as well. XXX Solution is to recharacterize as
9775 [rest-of-class]|multi1|multi2... */
9777 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
9778 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
9779 | PERL_SCAN_DISALLOW_PREFIX
9780 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
9782 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
9784 /* The tokenizer should have guaranteed validity, but it's possible to
9785 * bypass it by using single quoting, so check */
9786 if (length_of_hex == 0
9787 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
9789 RExC_parse += length_of_hex; /* Includes all the valid */
9790 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
9791 ? UTF8SKIP(RExC_parse)
9793 /* Guard against malformed utf8 */
9794 if (RExC_parse >= endchar) {
9795 RExC_parse = endchar;
9797 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9800 if (in_char_class && has_multiple_chars) {
9801 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
9804 RExC_parse = endbrace + 1;
9806 else if (! node_p || ! has_multiple_chars) {
9808 /* Here, the input is legal, but not according to the caller's
9809 * options. We fail without advancing the parse, so that the
9810 * caller can try again */
9816 /* What is done here is to convert this to a sub-pattern of the form
9817 * (?:\x{char1}\x{char2}...)
9818 * and then call reg recursively. That way, it retains its atomicness,
9819 * while not having to worry about special handling that some code
9820 * points may have. toke.c has converted the original Unicode values
9821 * to native, so that we can just pass on the hex values unchanged. We
9822 * do have to set a flag to keep recoding from happening in the
9825 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
9827 char *orig_end = RExC_end;
9830 while (RExC_parse < endbrace) {
9832 /* Convert to notation the rest of the code understands */
9833 sv_catpv(substitute_parse, "\\x{");
9834 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
9835 sv_catpv(substitute_parse, "}");
9837 /* Point to the beginning of the next character in the sequence. */
9838 RExC_parse = endchar + 1;
9839 endchar = RExC_parse + strcspn(RExC_parse, ".}");
9841 sv_catpv(substitute_parse, ")");
9843 RExC_parse = SvPV(substitute_parse, len);
9845 /* Don't allow empty number */
9847 vFAIL("Invalid hexadecimal number in \\N{U+...}");
9849 RExC_end = RExC_parse + len;
9851 /* The values are Unicode, and therefore not subject to recoding */
9852 RExC_override_recoding = 1;
9854 *node_p = reg(pRExC_state, 1, &flags, depth+1);
9855 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
9857 RExC_parse = endbrace;
9858 RExC_end = orig_end;
9859 RExC_override_recoding = 0;
9861 nextchar(pRExC_state);
9871 * It returns the code point in utf8 for the value in *encp.
9872 * value: a code value in the source encoding
9873 * encp: a pointer to an Encode object
9875 * If the result from Encode is not a single character,
9876 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
9879 S_reg_recode(pTHX_ const char value, SV **encp)
9882 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
9883 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9884 const STRLEN newlen = SvCUR(sv);
9885 UV uv = UNICODE_REPLACEMENT;
9887 PERL_ARGS_ASSERT_REG_RECODE;
9891 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
9894 if (!newlen || numlen != newlen) {
9895 uv = UNICODE_REPLACEMENT;
9901 PERL_STATIC_INLINE U8
9902 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
9906 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
9912 op = get_regex_charset(RExC_flags);
9913 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
9914 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
9915 been, so there is no hole */
9921 PERL_STATIC_INLINE void
9922 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
9924 /* This knows the details about sizing an EXACTish node, setting flags for
9925 * it (by setting <*flagp>, and potentially populating it with a single
9928 * If <len> (the length in bytes) is non-zero, this function assumes that
9929 * the node has already been populated, and just does the sizing. In this
9930 * case <code_point> should be the final code point that has already been
9931 * placed into the node. This value will be ignored except that under some
9932 * circumstances <*flagp> is set based on it.
9934 * If <len> is zero, the function assumes that the node is to contain only
9935 * the single character given by <code_point> and calculates what <len>
9936 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
9937 * additionally will populate the node's STRING with <code_point>, if <len>
9938 * is 0. In both cases <*flagp> is appropriately set
9940 * It knows that under FOLD, UTF characters and the Latin Sharp S must be
9941 * folded (the latter only when the rules indicate it can match 'ss') */
9943 bool len_passed_in = cBOOL(len != 0);
9944 U8 character[UTF8_MAXBYTES_CASE+1];
9946 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
9948 if (! len_passed_in) {
9951 to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
9954 uvchr_to_utf8( character, code_point);
9955 len = UTF8SKIP(character);
9959 || code_point != LATIN_SMALL_LETTER_SHARP_S
9960 || ASCII_FOLD_RESTRICTED
9961 || ! AT_LEAST_UNI_SEMANTICS)
9963 *character = (U8) code_point;
9968 *(character + 1) = 's';
9974 RExC_size += STR_SZ(len);
9977 RExC_emit += STR_SZ(len);
9978 STR_LEN(node) = len;
9979 if (! len_passed_in) {
9980 Copy((char *) character, STRING(node), len, char);
9986 /* A single character node is SIMPLE, except for the special-cased SHARP S
9988 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
9989 && (code_point != LATIN_SMALL_LETTER_SHARP_S
9990 || ! FOLD || ! DEPENDS_SEMANTICS))
9997 - regatom - the lowest level
9999 Try to identify anything special at the start of the pattern. If there
10000 is, then handle it as required. This may involve generating a single regop,
10001 such as for an assertion; or it may involve recursing, such as to
10002 handle a () structure.
10004 If the string doesn't start with something special then we gobble up
10005 as much literal text as we can.
10007 Once we have been able to handle whatever type of thing started the
10008 sequence, we return.
10010 Note: we have to be careful with escapes, as they can be both literal
10011 and special, and in the case of \10 and friends, context determines which.
10013 A summary of the code structure is:
10015 switch (first_byte) {
10016 cases for each special:
10017 handle this special;
10020 switch (2nd byte) {
10021 cases for each unambiguous special:
10022 handle this special;
10024 cases for each ambigous special/literal:
10026 if (special) handle here
10028 default: // unambiguously literal:
10031 default: // is a literal char
10034 create EXACTish node for literal;
10035 while (more input and node isn't full) {
10036 switch (input_byte) {
10037 cases for each special;
10038 make sure parse pointer is set so that the next call to
10039 regatom will see this special first
10040 goto loopdone; // EXACTish node terminated by prev. char
10042 append char to EXACTISH node;
10044 get next input byte;
10048 return the generated node;
10050 Specifically there are two separate switches for handling
10051 escape sequences, with the one for handling literal escapes requiring
10052 a dummy entry for all of the special escapes that are actually handled
10057 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10060 regnode *ret = NULL;
10062 char *parse_start = RExC_parse;
10064 GET_RE_DEBUG_FLAGS_DECL;
10065 DEBUG_PARSE("atom");
10066 *flagp = WORST; /* Tentatively. */
10068 PERL_ARGS_ASSERT_REGATOM;
10071 switch ((U8)*RExC_parse) {
10073 RExC_seen_zerolen++;
10074 nextchar(pRExC_state);
10075 if (RExC_flags & RXf_PMf_MULTILINE)
10076 ret = reg_node(pRExC_state, MBOL);
10077 else if (RExC_flags & RXf_PMf_SINGLELINE)
10078 ret = reg_node(pRExC_state, SBOL);
10080 ret = reg_node(pRExC_state, BOL);
10081 Set_Node_Length(ret, 1); /* MJD */
10084 nextchar(pRExC_state);
10086 RExC_seen_zerolen++;
10087 if (RExC_flags & RXf_PMf_MULTILINE)
10088 ret = reg_node(pRExC_state, MEOL);
10089 else if (RExC_flags & RXf_PMf_SINGLELINE)
10090 ret = reg_node(pRExC_state, SEOL);
10092 ret = reg_node(pRExC_state, EOL);
10093 Set_Node_Length(ret, 1); /* MJD */
10096 nextchar(pRExC_state);
10097 if (RExC_flags & RXf_PMf_SINGLELINE)
10098 ret = reg_node(pRExC_state, SANY);
10100 ret = reg_node(pRExC_state, REG_ANY);
10101 *flagp |= HASWIDTH|SIMPLE;
10103 Set_Node_Length(ret, 1); /* MJD */
10107 char * const oregcomp_parse = ++RExC_parse;
10108 ret = regclass(pRExC_state, flagp,depth+1);
10109 if (*RExC_parse != ']') {
10110 RExC_parse = oregcomp_parse;
10111 vFAIL("Unmatched [");
10113 nextchar(pRExC_state);
10114 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
10118 nextchar(pRExC_state);
10119 ret = reg(pRExC_state, 1, &flags,depth+1);
10121 if (flags & TRYAGAIN) {
10122 if (RExC_parse == RExC_end) {
10123 /* Make parent create an empty node if needed. */
10124 *flagp |= TRYAGAIN;
10131 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10135 if (flags & TRYAGAIN) {
10136 *flagp |= TRYAGAIN;
10139 vFAIL("Internal urp");
10140 /* Supposed to be caught earlier. */
10146 vFAIL("Quantifier follows nothing");
10151 This switch handles escape sequences that resolve to some kind
10152 of special regop and not to literal text. Escape sequnces that
10153 resolve to literal text are handled below in the switch marked
10156 Every entry in this switch *must* have a corresponding entry
10157 in the literal escape switch. However, the opposite is not
10158 required, as the default for this switch is to jump to the
10159 literal text handling code.
10161 switch ((U8)*++RExC_parse) {
10162 /* Special Escapes */
10164 RExC_seen_zerolen++;
10165 ret = reg_node(pRExC_state, SBOL);
10167 goto finish_meta_pat;
10169 ret = reg_node(pRExC_state, GPOS);
10170 RExC_seen |= REG_SEEN_GPOS;
10172 goto finish_meta_pat;
10174 RExC_seen_zerolen++;
10175 ret = reg_node(pRExC_state, KEEPS);
10177 /* XXX:dmq : disabling in-place substitution seems to
10178 * be necessary here to avoid cases of memory corruption, as
10179 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
10181 RExC_seen |= REG_SEEN_LOOKBEHIND;
10182 goto finish_meta_pat;
10184 ret = reg_node(pRExC_state, SEOL);
10186 RExC_seen_zerolen++; /* Do not optimize RE away */
10187 goto finish_meta_pat;
10189 ret = reg_node(pRExC_state, EOS);
10191 RExC_seen_zerolen++; /* Do not optimize RE away */
10192 goto finish_meta_pat;
10194 ret = reg_node(pRExC_state, CANY);
10195 RExC_seen |= REG_SEEN_CANY;
10196 *flagp |= HASWIDTH|SIMPLE;
10197 goto finish_meta_pat;
10199 ret = reg_node(pRExC_state, CLUMP);
10200 *flagp |= HASWIDTH;
10201 goto finish_meta_pat;
10203 op = ALNUM + get_regex_charset(RExC_flags);
10204 if (op > ALNUMA) { /* /aa is same as /a */
10207 ret = reg_node(pRExC_state, op);
10208 *flagp |= HASWIDTH|SIMPLE;
10209 goto finish_meta_pat;
10211 op = NALNUM + get_regex_charset(RExC_flags);
10212 if (op > NALNUMA) { /* /aa is same as /a */
10215 ret = reg_node(pRExC_state, op);
10216 *flagp |= HASWIDTH|SIMPLE;
10217 goto finish_meta_pat;
10219 RExC_seen_zerolen++;
10220 RExC_seen |= REG_SEEN_LOOKBEHIND;
10221 op = BOUND + get_regex_charset(RExC_flags);
10222 if (op > BOUNDA) { /* /aa is same as /a */
10225 ret = reg_node(pRExC_state, op);
10226 FLAGS(ret) = get_regex_charset(RExC_flags);
10228 goto finish_meta_pat;
10230 RExC_seen_zerolen++;
10231 RExC_seen |= REG_SEEN_LOOKBEHIND;
10232 op = NBOUND + get_regex_charset(RExC_flags);
10233 if (op > NBOUNDA) { /* /aa is same as /a */
10236 ret = reg_node(pRExC_state, op);
10237 FLAGS(ret) = get_regex_charset(RExC_flags);
10239 goto finish_meta_pat;
10241 op = SPACE + get_regex_charset(RExC_flags);
10242 if (op > SPACEA) { /* /aa is same as /a */
10245 ret = reg_node(pRExC_state, op);
10246 *flagp |= HASWIDTH|SIMPLE;
10247 goto finish_meta_pat;
10249 op = NSPACE + get_regex_charset(RExC_flags);
10250 if (op > NSPACEA) { /* /aa is same as /a */
10253 ret = reg_node(pRExC_state, op);
10254 *flagp |= HASWIDTH|SIMPLE;
10255 goto finish_meta_pat;
10263 U8 offset = get_regex_charset(RExC_flags);
10264 if (offset == REGEX_UNICODE_CHARSET) {
10265 offset = REGEX_DEPENDS_CHARSET;
10267 else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
10268 offset = REGEX_ASCII_RESTRICTED_CHARSET;
10272 ret = reg_node(pRExC_state, op);
10273 *flagp |= HASWIDTH|SIMPLE;
10274 goto finish_meta_pat;
10276 ret = reg_node(pRExC_state, LNBREAK);
10277 *flagp |= HASWIDTH;
10278 goto finish_meta_pat;
10280 ret = reg_node(pRExC_state, HORIZWS);
10281 *flagp |= HASWIDTH|SIMPLE;
10282 goto finish_meta_pat;
10284 ret = reg_node(pRExC_state, NHORIZWS);
10285 *flagp |= HASWIDTH|SIMPLE;
10286 goto finish_meta_pat;
10288 ret = reg_node(pRExC_state, VERTWS);
10289 *flagp |= HASWIDTH|SIMPLE;
10290 goto finish_meta_pat;
10292 ret = reg_node(pRExC_state, NVERTWS);
10293 *flagp |= HASWIDTH|SIMPLE;
10295 nextchar(pRExC_state);
10296 Set_Node_Length(ret, 2); /* MJD */
10301 char* const oldregxend = RExC_end;
10303 char* parse_start = RExC_parse - 2;
10306 if (RExC_parse[1] == '{') {
10307 /* a lovely hack--pretend we saw [\pX] instead */
10308 RExC_end = strchr(RExC_parse, '}');
10310 const U8 c = (U8)*RExC_parse;
10312 RExC_end = oldregxend;
10313 vFAIL2("Missing right brace on \\%c{}", c);
10318 RExC_end = RExC_parse + 2;
10319 if (RExC_end > oldregxend)
10320 RExC_end = oldregxend;
10324 ret = regclass(pRExC_state, flagp,depth+1);
10326 RExC_end = oldregxend;
10329 Set_Node_Offset(ret, parse_start + 2);
10330 Set_Node_Cur_Length(ret);
10331 nextchar(pRExC_state);
10335 /* Handle \N and \N{NAME} with multiple code points here and not
10336 * below because it can be multicharacter. join_exact() will join
10337 * them up later on. Also this makes sure that things like
10338 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
10339 * The options to the grok function call causes it to fail if the
10340 * sequence is just a single code point. We then go treat it as
10341 * just another character in the current EXACT node, and hence it
10342 * gets uniform treatment with all the other characters. The
10343 * special treatment for quantifiers is not needed for such single
10344 * character sequences */
10346 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
10351 case 'k': /* Handle \k<NAME> and \k'NAME' */
10354 char ch= RExC_parse[1];
10355 if (ch != '<' && ch != '\'' && ch != '{') {
10357 vFAIL2("Sequence %.2s... not terminated",parse_start);
10359 /* this pretty much dupes the code for (?P=...) in reg(), if
10360 you change this make sure you change that */
10361 char* name_start = (RExC_parse += 2);
10363 SV *sv_dat = reg_scan_name(pRExC_state,
10364 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10365 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
10366 if (RExC_parse == name_start || *RExC_parse != ch)
10367 vFAIL2("Sequence %.3s... not terminated",parse_start);
10370 num = add_data( pRExC_state, 1, "S" );
10371 RExC_rxi->data->data[num]=(void*)sv_dat;
10372 SvREFCNT_inc_simple_void(sv_dat);
10376 ret = reganode(pRExC_state,
10379 : (ASCII_FOLD_RESTRICTED)
10381 : (AT_LEAST_UNI_SEMANTICS)
10387 *flagp |= HASWIDTH;
10389 /* override incorrect value set in reganode MJD */
10390 Set_Node_Offset(ret, parse_start+1);
10391 Set_Node_Cur_Length(ret); /* MJD */
10392 nextchar(pRExC_state);
10398 case '1': case '2': case '3': case '4':
10399 case '5': case '6': case '7': case '8': case '9':
10402 bool isg = *RExC_parse == 'g';
10407 if (*RExC_parse == '{') {
10411 if (*RExC_parse == '-') {
10415 if (hasbrace && !isDIGIT(*RExC_parse)) {
10416 if (isrel) RExC_parse--;
10418 goto parse_named_seq;
10420 num = atoi(RExC_parse);
10421 if (isg && num == 0)
10422 vFAIL("Reference to invalid group 0");
10424 num = RExC_npar - num;
10426 vFAIL("Reference to nonexistent or unclosed group");
10428 if (!isg && num > 9 && num >= RExC_npar)
10429 /* Probably a character specified in octal, e.g. \35 */
10432 char * const parse_start = RExC_parse - 1; /* MJD */
10433 while (isDIGIT(*RExC_parse))
10435 if (parse_start == RExC_parse - 1)
10436 vFAIL("Unterminated \\g... pattern");
10438 if (*RExC_parse != '}')
10439 vFAIL("Unterminated \\g{...} pattern");
10443 if (num > (I32)RExC_rx->nparens)
10444 vFAIL("Reference to nonexistent group");
10447 ret = reganode(pRExC_state,
10450 : (ASCII_FOLD_RESTRICTED)
10452 : (AT_LEAST_UNI_SEMANTICS)
10458 *flagp |= HASWIDTH;
10460 /* override incorrect value set in reganode MJD */
10461 Set_Node_Offset(ret, parse_start+1);
10462 Set_Node_Cur_Length(ret); /* MJD */
10464 nextchar(pRExC_state);
10469 if (RExC_parse >= RExC_end)
10470 FAIL("Trailing \\");
10473 /* Do not generate "unrecognized" warnings here, we fall
10474 back into the quick-grab loop below */
10481 if (RExC_flags & RXf_PMf_EXTENDED) {
10482 if ( reg_skipcomment( pRExC_state ) )
10489 parse_start = RExC_parse - 1;
10498 #define MAX_NODE_STRING_SIZE 127
10499 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
10501 U8 upper_parse = MAX_NODE_STRING_SIZE;
10504 bool next_is_quantifier;
10505 char * oldp = NULL;
10507 /* If a folding node contains only code points that don't
10508 * participate in folds, it can be changed into an EXACT node,
10509 * which allows the optimizer more things to look for */
10513 node_type = compute_EXACTish(pRExC_state);
10514 ret = reg_node(pRExC_state, node_type);
10516 /* In pass1, folded, we use a temporary buffer instead of the
10517 * actual node, as the node doesn't exist yet */
10518 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
10524 /* We do the EXACTFish to EXACT node only if folding, and not if in
10525 * locale, as whether a character folds or not isn't known until
10527 maybe_exact = FOLD && ! LOC;
10529 /* XXX The node can hold up to 255 bytes, yet this only goes to
10530 * 127. I (khw) do not know why. Keeping it somewhat less than
10531 * 255 allows us to not have to worry about overflow due to
10532 * converting to utf8 and fold expansion, but that value is
10533 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
10534 * split up by this limit into a single one using the real max of
10535 * 255. Even at 127, this breaks under rare circumstances. If
10536 * folding, we do not want to split a node at a character that is a
10537 * non-final in a multi-char fold, as an input string could just
10538 * happen to want to match across the node boundary. The join
10539 * would solve that problem if the join actually happens. But a
10540 * series of more than two nodes in a row each of 127 would cause
10541 * the first join to succeed to get to 254, but then there wouldn't
10542 * be room for the next one, which could at be one of those split
10543 * multi-char folds. I don't know of any fool-proof solution. One
10544 * could back off to end with only a code point that isn't such a
10545 * non-final, but it is possible for there not to be any in the
10547 for (p = RExC_parse - 1;
10548 len < upper_parse && p < RExC_end;
10553 if (RExC_flags & RXf_PMf_EXTENDED)
10554 p = regwhite( pRExC_state, p );
10565 /* Literal Escapes Switch
10567 This switch is meant to handle escape sequences that
10568 resolve to a literal character.
10570 Every escape sequence that represents something
10571 else, like an assertion or a char class, is handled
10572 in the switch marked 'Special Escapes' above in this
10573 routine, but also has an entry here as anything that
10574 isn't explicitly mentioned here will be treated as
10575 an unescaped equivalent literal.
10578 switch ((U8)*++p) {
10579 /* These are all the special escapes. */
10580 case 'A': /* Start assertion */
10581 case 'b': case 'B': /* Word-boundary assertion*/
10582 case 'C': /* Single char !DANGEROUS! */
10583 case 'd': case 'D': /* digit class */
10584 case 'g': case 'G': /* generic-backref, pos assertion */
10585 case 'h': case 'H': /* HORIZWS */
10586 case 'k': case 'K': /* named backref, keep marker */
10587 case 'p': case 'P': /* Unicode property */
10588 case 'R': /* LNBREAK */
10589 case 's': case 'S': /* space class */
10590 case 'v': case 'V': /* VERTWS */
10591 case 'w': case 'W': /* word class */
10592 case 'X': /* eXtended Unicode "combining character sequence" */
10593 case 'z': case 'Z': /* End of line/string assertion */
10597 /* Anything after here is an escape that resolves to a
10598 literal. (Except digits, which may or may not)
10604 case 'N': /* Handle a single-code point named character. */
10605 /* The options cause it to fail if a multiple code
10606 * point sequence. Handle those in the switch() above
10608 RExC_parse = p + 1;
10609 if (! grok_bslash_N(pRExC_state, NULL, &ender,
10610 flagp, depth, FALSE))
10612 RExC_parse = p = oldp;
10616 if (ender > 0xff) {
10633 ender = ASCII_TO_NATIVE('\033');
10637 ender = ASCII_TO_NATIVE('\007');
10642 STRLEN brace_len = len;
10644 const char* error_msg;
10646 bool valid = grok_bslash_o(p,
10653 RExC_parse = p; /* going to die anyway; point
10654 to exact spot of failure */
10661 if (PL_encoding && ender < 0x100) {
10662 goto recode_encoding;
10664 if (ender > 0xff) {
10671 STRLEN brace_len = len;
10673 const char* error_msg;
10675 bool valid = grok_bslash_x(p,
10682 RExC_parse = p; /* going to die anyway; point
10683 to exact spot of failure */
10689 if (PL_encoding && ender < 0x100) {
10690 goto recode_encoding;
10692 if (ender > 0xff) {
10699 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
10701 case '0': case '1': case '2': case '3':case '4':
10702 case '5': case '6': case '7':
10704 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
10706 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
10708 ender = grok_oct(p, &numlen, &flags, NULL);
10709 if (ender > 0xff) {
10718 if (PL_encoding && ender < 0x100)
10719 goto recode_encoding;
10722 if (! RExC_override_recoding) {
10723 SV* enc = PL_encoding;
10724 ender = reg_recode((const char)(U8)ender, &enc);
10725 if (!enc && SIZE_ONLY)
10726 ckWARNreg(p, "Invalid escape in the specified encoding");
10732 FAIL("Trailing \\");
10735 if (!SIZE_ONLY&& isALNUMC(*p)) {
10736 ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
10738 goto normal_default;
10742 /* Currently we don't warn when the lbrace is at the start
10743 * of a construct. This catches it in the middle of a
10744 * literal string, or when its the first thing after
10745 * something like "\b" */
10747 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
10749 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
10754 if (UTF8_IS_START(*p) && UTF) {
10756 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
10757 &numlen, UTF8_ALLOW_DEFAULT);
10763 } /* End of switch on the literal */
10765 /* Here, have looked at the literal character and <ender>
10766 * contains its ordinal, <p> points to the character after it
10769 if ( RExC_flags & RXf_PMf_EXTENDED)
10770 p = regwhite( pRExC_state, p );
10772 /* If the next thing is a quantifier, it applies to this
10773 * character only, which means that this character has to be in
10774 * its own node and can't just be appended to the string in an
10775 * existing node, so if there are already other characters in
10776 * the node, close the node with just them, and set up to do
10777 * this character again next time through, when it will be the
10778 * only thing in its new node */
10779 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
10787 /* See comments for join_exact() as to why we fold
10788 * this non-UTF at compile time */
10789 || (node_type == EXACTFU
10790 && ender == LATIN_SMALL_LETTER_SHARP_S))
10794 /* Prime the casefolded buffer. Locale rules, which
10795 * apply only to code points < 256, aren't known until
10796 * execution, so for them, just output the original
10797 * character using utf8. If we start to fold non-UTF
10798 * patterns, be sure to update join_exact() */
10799 if (LOC && ender < 256) {
10800 if (UNI_IS_INVARIANT(ender)) {
10804 *s = UTF8_TWO_BYTE_HI(ender);
10805 *(s + 1) = UTF8_TWO_BYTE_LO(ender);
10810 UV folded = _to_uni_fold_flags(
10815 | ((LOC) ? FOLD_FLAGS_LOCALE
10816 : (ASCII_FOLD_RESTRICTED)
10817 ? FOLD_FLAGS_NOMIX_ASCII
10821 /* If this node only contains non-folding code
10822 * points so far, see if this new one is also
10825 if (folded != ender) {
10826 maybe_exact = FALSE;
10829 /* Here the fold is the original; we have
10830 * to check further to see if anything
10832 if (! PL_utf8_foldable) {
10833 SV* swash = swash_init("utf8",
10835 &PL_sv_undef, 1, 0);
10837 _get_swash_invlist(swash);
10838 SvREFCNT_dec(swash);
10840 if (_invlist_contains_cp(PL_utf8_foldable,
10843 maybe_exact = FALSE;
10851 /* The loop increments <len> each time, as all but this
10852 * path (and the one just below for UTF) through it add
10853 * a single byte to the EXACTish node. But this one
10854 * has changed len to be the correct final value, so
10855 * subtract one to cancel out the increment that
10857 len += foldlen - 1;
10861 maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
10865 const STRLEN unilen = reguni(pRExC_state, ender, s);
10871 /* See comment just above for - 1 */
10875 REGC((char)ender, s++);
10878 if (next_is_quantifier) {
10880 /* Here, the next input is a quantifier, and to get here,
10881 * the current character is the only one in the node.
10882 * Also, here <len> doesn't include the final byte for this
10888 } /* End of loop through literal characters */
10890 /* Here we have either exhausted the input or ran out of room in
10891 * the node. (If we encountered a character that can't be in the
10892 * node, transfer is made directly to <loopdone>, and so we
10893 * wouldn't have fallen off the end of the loop.) In the latter
10894 * case, we artificially have to split the node into two, because
10895 * we just don't have enough space to hold everything. This
10896 * creates a problem if the final character participates in a
10897 * multi-character fold in the non-final position, as a match that
10898 * should have occurred won't, due to the way nodes are matched,
10899 * and our artificial boundary. So back off until we find a non-
10900 * problematic character -- one that isn't at the beginning or
10901 * middle of such a fold. (Either it doesn't participate in any
10902 * folds, or appears only in the final position of all the folds it
10903 * does participate in.) A better solution with far fewer false
10904 * positives, and that would fill the nodes more completely, would
10905 * be to actually have available all the multi-character folds to
10906 * test against, and to back-off only far enough to be sure that
10907 * this node isn't ending with a partial one. <upper_parse> is set
10908 * further below (if we need to reparse the node) to include just
10909 * up through that final non-problematic character that this code
10910 * identifies, so when it is set to less than the full node, we can
10911 * skip the rest of this */
10912 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
10914 const STRLEN full_len = len;
10916 assert(len >= MAX_NODE_STRING_SIZE);
10918 /* Here, <s> points to the final byte of the final character.
10919 * Look backwards through the string until find a non-
10920 * problematic character */
10924 /* These two have no multi-char folds to non-UTF characters
10926 if (ASCII_FOLD_RESTRICTED || LOC) {
10930 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
10934 if (! PL_NonL1NonFinalFold) {
10935 PL_NonL1NonFinalFold = _new_invlist_C_array(
10936 NonL1_Perl_Non_Final_Folds_invlist);
10939 /* Point to the first byte of the final character */
10940 s = (char *) utf8_hop((U8 *) s, -1);
10942 while (s >= s0) { /* Search backwards until find
10943 non-problematic char */
10944 if (UTF8_IS_INVARIANT(*s)) {
10946 /* There are no ascii characters that participate
10947 * in multi-char folds under /aa. In EBCDIC, the
10948 * non-ascii invariants are all control characters,
10949 * so don't ever participate in any folds. */
10950 if (ASCII_FOLD_RESTRICTED
10951 || ! IS_NON_FINAL_FOLD(*s))
10956 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
10958 /* No Latin1 characters participate in multi-char
10959 * folds under /l */
10961 || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
10967 else if (! _invlist_contains_cp(
10968 PL_NonL1NonFinalFold,
10969 valid_utf8_to_uvchr((U8 *) s, NULL)))
10974 /* Here, the current character is problematic in that
10975 * it does occur in the non-final position of some
10976 * fold, so try the character before it, but have to
10977 * special case the very first byte in the string, so
10978 * we don't read outside the string */
10979 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
10980 } /* End of loop backwards through the string */
10982 /* If there were only problematic characters in the string,
10983 * <s> will point to before s0, in which case the length
10984 * should be 0, otherwise include the length of the
10985 * non-problematic character just found */
10986 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
10989 /* Here, have found the final character, if any, that is
10990 * non-problematic as far as ending the node without splitting
10991 * it across a potential multi-char fold. <len> contains the
10992 * number of bytes in the node up-to and including that
10993 * character, or is 0 if there is no such character, meaning
10994 * the whole node contains only problematic characters. In
10995 * this case, give up and just take the node as-is. We can't
11001 /* Here, the node does contain some characters that aren't
11002 * problematic. If one such is the final character in the
11003 * node, we are done */
11004 if (len == full_len) {
11007 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
11009 /* If the final character is problematic, but the
11010 * penultimate is not, back-off that last character to
11011 * later start a new node with it */
11016 /* Here, the final non-problematic character is earlier
11017 * in the input than the penultimate character. What we do
11018 * is reparse from the beginning, going up only as far as
11019 * this final ok one, thus guaranteeing that the node ends
11020 * in an acceptable character. The reason we reparse is
11021 * that we know how far in the character is, but we don't
11022 * know how to correlate its position with the input parse.
11023 * An alternate implementation would be to build that
11024 * correlation as we go along during the original parse,
11025 * but that would entail extra work for every node, whereas
11026 * this code gets executed only when the string is too
11027 * large for the node, and the final two characters are
11028 * problematic, an infrequent occurrence. Yet another
11029 * possible strategy would be to save the tail of the
11030 * string, and the next time regatom is called, initialize
11031 * with that. The problem with this is that unless you
11032 * back off one more character, you won't be guaranteed
11033 * regatom will get called again, unless regbranch,
11034 * regpiece ... are also changed. If you do back off that
11035 * extra character, so that there is input guaranteed to
11036 * force calling regatom, you can't handle the case where
11037 * just the first character in the node is acceptable. I
11038 * (khw) decided to try this method which doesn't have that
11039 * pitfall; if performance issues are found, we can do a
11040 * combination of the current approach plus that one */
11046 } /* End of verifying node ends with an appropriate char */
11048 loopdone: /* Jumped to when encounters something that shouldn't be in
11051 /* If 'maybe_exact' is still set here, means there are no
11052 * code points in the node that participate in folds */
11053 if (FOLD && maybe_exact) {
11057 /* I (khw) don't know if you can get here with zero length, but the
11058 * old code handled this situation by creating a zero-length EXACT
11059 * node. Might as well be NOTHING instead */
11064 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
11067 RExC_parse = p - 1;
11068 Set_Node_Cur_Length(ret); /* MJD */
11069 nextchar(pRExC_state);
11071 /* len is STRLEN which is unsigned, need to copy to signed */
11074 vFAIL("Internal disaster");
11077 } /* End of label 'defchar:' */
11079 } /* End of giant switch on input character */
11085 S_regwhite( RExC_state_t *pRExC_state, char *p )
11087 const char *e = RExC_end;
11089 PERL_ARGS_ASSERT_REGWHITE;
11094 else if (*p == '#') {
11097 if (*p++ == '\n') {
11103 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11111 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
11112 Character classes ([:foo:]) can also be negated ([:^foo:]).
11113 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
11114 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
11115 but trigger failures because they are currently unimplemented. */
11117 #define POSIXCC_DONE(c) ((c) == ':')
11118 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
11119 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
11122 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
11125 I32 namedclass = OOB_NAMEDCLASS;
11127 PERL_ARGS_ASSERT_REGPPOSIXCC;
11129 if (value == '[' && RExC_parse + 1 < RExC_end &&
11130 /* I smell either [: or [= or [. -- POSIX has been here, right? */
11131 POSIXCC(UCHARAT(RExC_parse))) {
11132 const char c = UCHARAT(RExC_parse);
11133 char* const s = RExC_parse++;
11135 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
11137 if (RExC_parse == RExC_end)
11138 /* Grandfather lone [:, [=, [. */
11141 const char* const t = RExC_parse++; /* skip over the c */
11144 if (UCHARAT(RExC_parse) == ']') {
11145 const char *posixcc = s + 1;
11146 RExC_parse++; /* skip over the ending ] */
11149 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
11150 const I32 skip = t - posixcc;
11152 /* Initially switch on the length of the name. */
11155 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
11156 namedclass = ANYOF_WORDCHAR;
11159 /* Names all of length 5. */
11160 /* alnum alpha ascii blank cntrl digit graph lower
11161 print punct space upper */
11162 /* Offset 4 gives the best switch position. */
11163 switch (posixcc[4]) {
11165 if (memEQ(posixcc, "alph", 4)) /* alpha */
11166 namedclass = ANYOF_ALPHA;
11169 if (memEQ(posixcc, "spac", 4)) /* space */
11170 namedclass = ANYOF_PSXSPC;
11173 if (memEQ(posixcc, "grap", 4)) /* graph */
11174 namedclass = ANYOF_GRAPH;
11177 if (memEQ(posixcc, "asci", 4)) /* ascii */
11178 namedclass = ANYOF_ASCII;
11181 if (memEQ(posixcc, "blan", 4)) /* blank */
11182 namedclass = ANYOF_BLANK;
11185 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
11186 namedclass = ANYOF_CNTRL;
11189 if (memEQ(posixcc, "alnu", 4)) /* alnum */
11190 namedclass = ANYOF_ALNUMC;
11193 if (memEQ(posixcc, "lowe", 4)) /* lower */
11194 namedclass = ANYOF_LOWER;
11195 else if (memEQ(posixcc, "uppe", 4)) /* upper */
11196 namedclass = ANYOF_UPPER;
11199 if (memEQ(posixcc, "digi", 4)) /* digit */
11200 namedclass = ANYOF_DIGIT;
11201 else if (memEQ(posixcc, "prin", 4)) /* print */
11202 namedclass = ANYOF_PRINT;
11203 else if (memEQ(posixcc, "punc", 4)) /* punct */
11204 namedclass = ANYOF_PUNCT;
11209 if (memEQ(posixcc, "xdigit", 6))
11210 namedclass = ANYOF_XDIGIT;
11214 if (namedclass == OOB_NAMEDCLASS)
11215 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
11218 /* The #defines are structured so each complement is +1 to
11219 * the normal one */
11223 assert (posixcc[skip] == ':');
11224 assert (posixcc[skip+1] == ']');
11225 } else if (!SIZE_ONLY) {
11226 /* [[=foo=]] and [[.foo.]] are still future. */
11228 /* adjust RExC_parse so the warning shows after
11229 the class closes */
11230 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
11232 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11235 /* Maternal grandfather:
11236 * "[:" ending in ":" but not in ":]" */
11246 S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
11250 PERL_ARGS_ASSERT_CHECKPOSIXCC;
11252 if (POSIXCC(UCHARAT(RExC_parse))) {
11253 const char *s = RExC_parse;
11254 const char c = *s++;
11256 while (isALNUM(*s))
11258 if (*s && c == *s && s[1] == ']') {
11260 "POSIX syntax [%c %c] belongs inside character classes",
11263 /* [[=foo=]] and [[.foo.]] are still future. */
11264 if (POSIXCC_NOTYET(c)) {
11265 /* adjust RExC_parse so the error shows after
11266 the class closes */
11267 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
11269 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
11275 /* Generate the code to add a full posix character <class> to the bracketed
11276 * character class given by <node>. (<node> is needed only under locale rules)
11277 * destlist is the inversion list for non-locale rules that this class is
11279 * sourcelist is the ASCII-range inversion list to add under /a rules
11280 * Xsourcelist is the full Unicode range list to use otherwise. */
11281 #define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11283 SV* scratch_list = NULL; \
11285 /* Set this class in the node for runtime matching */ \
11286 ANYOF_CLASS_SET(node, class); \
11288 /* For above Latin1 code points, we use the full Unicode range */ \
11289 _invlist_intersection(PL_AboveLatin1, \
11292 /* And set the output to it, adding instead if there already is an \
11293 * output. Checking if <destlist> is NULL first saves an extra \
11294 * clone. Its reference count will be decremented at the next \
11295 * union, etc, or if this is the only instance, at the end of the \
11297 if (! destlist) { \
11298 destlist = scratch_list; \
11301 _invlist_union(destlist, scratch_list, &destlist); \
11302 SvREFCNT_dec(scratch_list); \
11306 /* For non-locale, just add it to any existing list */ \
11307 _invlist_union(destlist, \
11308 (AT_LEAST_ASCII_RESTRICTED) \
11314 /* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
11316 #define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
11318 SV* scratch_list = NULL; \
11319 ANYOF_CLASS_SET(node, class); \
11320 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
11321 if (! destlist) { \
11322 destlist = scratch_list; \
11325 _invlist_union(destlist, scratch_list, &destlist); \
11326 SvREFCNT_dec(scratch_list); \
11330 _invlist_union_complement_2nd(destlist, \
11331 (AT_LEAST_ASCII_RESTRICTED) \
11335 /* Under /d, everything in the upper half of the Latin1 range \
11336 * matches this complement */ \
11337 if (DEPENDS_SEMANTICS) { \
11338 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11342 /* Generate the code to add a posix character <class> to the bracketed
11343 * character class given by <node>. (<node> is needed only under locale rules)
11344 * destlist is the inversion list for non-locale rules that this class is
11346 * sourcelist is the ASCII-range inversion list to add under /a rules
11347 * l1_sourcelist is the Latin1 range list to use otherwise.
11348 * Xpropertyname is the name to add to <run_time_list> of the property to
11349 * specify the code points above Latin1 that will have to be
11350 * determined at run-time
11351 * run_time_list is a SV* that contains text names of properties that are to
11352 * be computed at run time. This concatenates <Xpropertyname>
11353 * to it, appropriately
11354 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
11356 #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11357 l1_sourcelist, Xpropertyname, run_time_list) \
11358 /* First, resolve whether to use the ASCII-only list or the L1 \
11360 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
11361 ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
11362 Xpropertyname, run_time_list)
11364 #define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
11365 Xpropertyname, run_time_list) \
11366 /* If not /a matching, there are going to be code points we will have \
11367 * to defer to runtime to look-up */ \
11368 if (! AT_LEAST_ASCII_RESTRICTED) { \
11369 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
11372 ANYOF_CLASS_SET(node, class); \
11375 _invlist_union(destlist, sourcelist, &destlist); \
11378 /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
11379 * this and DO_N_POSIX. Sets <matches_above_unicode> only if it can; unchanged
11381 #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
11382 l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
11383 if (AT_LEAST_ASCII_RESTRICTED) { \
11384 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
11387 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
11388 matches_above_unicode = TRUE; \
11390 ANYOF_CLASS_SET(node, namedclass); \
11393 SV* scratch_list = NULL; \
11394 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
11395 if (! destlist) { \
11396 destlist = scratch_list; \
11399 _invlist_union(destlist, scratch_list, &destlist); \
11400 SvREFCNT_dec(scratch_list); \
11402 if (DEPENDS_SEMANTICS) { \
11403 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
11408 /* The names of properties whose definitions are not known at compile time are
11409 * stored in this SV, after a constant heading. So if the length has been
11410 * changed since initialization, then there is a run-time definition. */
11411 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
11413 /* This converts the named class defined in regcomp.h to its equivalent class
11414 * number defined in handy.h. */
11415 #define namedclass_to_classnum(class) ((class) / 2)
11418 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11420 /* parse a bracketed class specification. Most of these will produce an ANYOF node;
11421 * but something like [a] will produce an EXACT node; [aA], an EXACTFish
11422 * node; [[:ascii:]], a POSIXA node; etc. It is more complex under /i with
11423 * multi-character folds: it will be rewritten following the paradigm of
11424 * this example, where the <multi-fold>s are characters which fold to
11425 * multiple character sequences:
11426 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
11427 * gets effectively rewritten as:
11428 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
11429 * reg() gets called (recursively) on the rewritten version, and this
11430 * function will return what it constructs. (Actually the <multi-fold>s
11431 * aren't physically removed from the [abcdefghi], it's just that they are
11432 * ignored in the recursion by means of a a flag:
11433 * <RExC_in_multi_char_class>.)
11435 * ANYOF nodes contain a bit map for the first 256 characters, with the
11436 * corresponding bit set if that character is in the list. For characters
11437 * above 255, a range list or swash is used. There are extra bits for \w,
11438 * etc. in locale ANYOFs, as what these match is not determinable at
11443 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
11445 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
11448 IV namedclass = OOB_NAMEDCLASS;
11449 char *rangebegin = NULL;
11450 bool need_class = 0;
11451 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
11453 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
11454 than just initialized. */
11455 SV* properties = NULL; /* Code points that match \p{} \P{} */
11456 SV* posixes = NULL; /* Code points that match classes like, [:word:],
11457 extended beyond the Latin1 range */
11458 UV element_count = 0; /* Number of distinct elements in the class.
11459 Optimizations may be possible if this is tiny */
11460 AV * multi_char_matches = NULL; /* Code points that fold to more than one
11461 character; used under /i */
11464 /* Unicode properties are stored in a swash; this holds the current one
11465 * being parsed. If this swash is the only above-latin1 component of the
11466 * character class, an optimization is to pass it directly on to the
11467 * execution engine. Otherwise, it is set to NULL to indicate that there
11468 * are other things in the class that have to be dealt with at execution
11470 SV* swash = NULL; /* Code points that match \p{} \P{} */
11472 /* Set if a component of this character class is user-defined; just passed
11473 * on to the engine */
11474 bool has_user_defined_property = FALSE;
11476 /* inversion list of code points this node matches only when the target
11477 * string is in UTF-8. (Because is under /d) */
11478 SV* depends_list = NULL;
11480 /* inversion list of code points this node matches. For much of the
11481 * function, it includes only those that match regardless of the utf8ness
11482 * of the target string */
11483 SV* cp_list = NULL;
11486 /* In a range, counts how many 0-2 of the ends of it came from literals,
11487 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
11488 UV literal_endpoint = 0;
11490 bool invert = FALSE; /* Is this class to be complemented */
11492 /* Is there any thing like \W or [:^digit:] that matches above the legal
11493 * Unicode range? */
11494 bool runtime_posix_matches_above_Unicode = FALSE;
11496 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
11497 case we need to change the emitted regop to an EXACT. */
11498 const char * orig_parse = RExC_parse;
11499 const I32 orig_size = RExC_size;
11500 GET_RE_DEBUG_FLAGS_DECL;
11502 PERL_ARGS_ASSERT_REGCLASS;
11504 PERL_UNUSED_ARG(depth);
11507 DEBUG_PARSE("clas");
11509 /* Assume we are going to generate an ANYOF node. */
11510 ret = reganode(pRExC_state, ANYOF, 0);
11513 ANYOF_FLAGS(ret) = 0;
11516 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
11518 if (! RExC_in_multi_char_class) {
11522 /* We have decided to not allow multi-char folds in inverted
11523 * character classes, due to the confusion that can happen,
11524 * especially with classes that are designed for a non-Unicode
11525 * world: You have the peculiar case that:
11526 "s s" =~ /^[^\xDF]+$/i => Y
11527 "ss" =~ /^[^\xDF]+$/i => N
11529 * See [perl #89750] */
11530 allow_full_fold = FALSE;
11535 RExC_size += ANYOF_SKIP;
11536 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
11539 RExC_emit += ANYOF_SKIP;
11541 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
11543 listsv = newSVpvs("# comment\n");
11544 initial_listsv_len = SvCUR(listsv);
11547 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11549 if (!SIZE_ONLY && POSIXCC(nextvalue))
11550 checkposixcc(pRExC_state);
11552 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
11553 if (UCHARAT(RExC_parse) == ']')
11554 goto charclassloop;
11557 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
11561 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
11562 save_value = value;
11563 save_prevvalue = prevvalue;
11566 rangebegin = RExC_parse;
11570 value = utf8n_to_uvchr((U8*)RExC_parse,
11571 RExC_end - RExC_parse,
11572 &numlen, UTF8_ALLOW_DEFAULT);
11573 RExC_parse += numlen;
11576 value = UCHARAT(RExC_parse++);
11578 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
11579 if (value == '[' && POSIXCC(nextvalue))
11580 namedclass = regpposixcc(pRExC_state, value);
11581 else if (value == '\\') {
11583 value = utf8n_to_uvchr((U8*)RExC_parse,
11584 RExC_end - RExC_parse,
11585 &numlen, UTF8_ALLOW_DEFAULT);
11586 RExC_parse += numlen;
11589 value = UCHARAT(RExC_parse++);
11590 /* Some compilers cannot handle switching on 64-bit integer
11591 * values, therefore value cannot be an UV. Yes, this will
11592 * be a problem later if we want switch on Unicode.
11593 * A similar issue a little bit later when switching on
11594 * namedclass. --jhi */
11595 switch ((I32)value) {
11596 case 'w': namedclass = ANYOF_WORDCHAR; break;
11597 case 'W': namedclass = ANYOF_NWORDCHAR; break;
11598 case 's': namedclass = ANYOF_SPACE; break;
11599 case 'S': namedclass = ANYOF_NSPACE; break;
11600 case 'd': namedclass = ANYOF_DIGIT; break;
11601 case 'D': namedclass = ANYOF_NDIGIT; break;
11602 case 'v': namedclass = ANYOF_VERTWS; break;
11603 case 'V': namedclass = ANYOF_NVERTWS; break;
11604 case 'h': namedclass = ANYOF_HORIZWS; break;
11605 case 'H': namedclass = ANYOF_NHORIZWS; break;
11606 case 'N': /* Handle \N{NAME} in class */
11608 /* We only pay attention to the first char of
11609 multichar strings being returned. I kinda wonder
11610 if this makes sense as it does change the behaviour
11611 from earlier versions, OTOH that behaviour was broken
11613 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
11614 TRUE /* => charclass */))
11625 /* This routine will handle any undefined properties */
11626 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
11628 if (RExC_parse >= RExC_end)
11629 vFAIL2("Empty \\%c{}", (U8)value);
11630 if (*RExC_parse == '{') {
11631 const U8 c = (U8)value;
11632 e = strchr(RExC_parse++, '}');
11634 vFAIL2("Missing right brace on \\%c{}", c);
11635 while (isSPACE(UCHARAT(RExC_parse)))
11637 if (e == RExC_parse)
11638 vFAIL2("Empty \\%c{}", c);
11639 n = e - RExC_parse;
11640 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
11651 if (UCHARAT(RExC_parse) == '^') {
11654 value = value == 'p' ? 'P' : 'p'; /* toggle */
11655 while (isSPACE(UCHARAT(RExC_parse))) {
11660 /* Try to get the definition of the property into
11661 * <invlist>. If /i is in effect, the effective property
11662 * will have its name be <__NAME_i>. The design is
11663 * discussed in commit
11664 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
11665 Newx(name, n + sizeof("_i__\n"), char);
11667 sprintf(name, "%s%.*s%s\n",
11668 (FOLD) ? "__" : "",
11674 /* Look up the property name, and get its swash and
11675 * inversion list, if the property is found */
11677 SvREFCNT_dec(swash);
11679 swash = _core_swash_init("utf8", name, &PL_sv_undef,
11682 NULL, /* No inversion list */
11685 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
11687 SvREFCNT_dec(swash);
11691 /* Here didn't find it. It could be a user-defined
11692 * property that will be available at run-time. Add it
11693 * to the list to look up then */
11694 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
11695 (value == 'p' ? '+' : '!'),
11697 has_user_defined_property = TRUE;
11699 /* We don't know yet, so have to assume that the
11700 * property could match something in the Latin1 range,
11701 * hence something that isn't utf8. Note that this
11702 * would cause things in <depends_list> to match
11703 * inappropriately, except that any \p{}, including
11704 * this one forces Unicode semantics, which means there
11705 * is <no depends_list> */
11706 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
11710 /* Here, did get the swash and its inversion list. If
11711 * the swash is from a user-defined property, then this
11712 * whole character class should be regarded as such */
11713 has_user_defined_property =
11715 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
11717 /* Invert if asking for the complement */
11718 if (value == 'P') {
11719 _invlist_union_complement_2nd(properties,
11723 /* The swash can't be used as-is, because we've
11724 * inverted things; delay removing it to here after
11725 * have copied its invlist above */
11726 SvREFCNT_dec(swash);
11730 _invlist_union(properties, invlist, &properties);
11735 RExC_parse = e + 1;
11736 namedclass = ANYOF_MAX; /* no official name, but it's named */
11738 /* \p means they want Unicode semantics */
11739 RExC_uni_semantics = 1;
11742 case 'n': value = '\n'; break;
11743 case 'r': value = '\r'; break;
11744 case 't': value = '\t'; break;
11745 case 'f': value = '\f'; break;
11746 case 'b': value = '\b'; break;
11747 case 'e': value = ASCII_TO_NATIVE('\033');break;
11748 case 'a': value = ASCII_TO_NATIVE('\007');break;
11750 RExC_parse--; /* function expects to be pointed at the 'o' */
11752 const char* error_msg;
11753 bool valid = grok_bslash_o(RExC_parse,
11758 RExC_parse += numlen;
11763 if (PL_encoding && value < 0x100) {
11764 goto recode_encoding;
11768 RExC_parse--; /* function expects to be pointed at the 'x' */
11770 const char* error_msg;
11771 bool valid = grok_bslash_x(RExC_parse,
11776 RExC_parse += numlen;
11781 if (PL_encoding && value < 0x100)
11782 goto recode_encoding;
11785 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
11787 case '0': case '1': case '2': case '3': case '4':
11788 case '5': case '6': case '7':
11790 /* Take 1-3 octal digits */
11791 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11793 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
11794 RExC_parse += numlen;
11795 if (PL_encoding && value < 0x100)
11796 goto recode_encoding;
11800 if (! RExC_override_recoding) {
11801 SV* enc = PL_encoding;
11802 value = reg_recode((const char)(U8)value, &enc);
11803 if (!enc && SIZE_ONLY)
11804 ckWARNreg(RExC_parse,
11805 "Invalid escape in the specified encoding");
11809 /* Allow \_ to not give an error */
11810 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
11811 ckWARN2reg(RExC_parse,
11812 "Unrecognized escape \\%c in character class passed through",
11817 } /* end of \blah */
11820 literal_endpoint++;
11823 /* What matches in a locale is not known until runtime. This
11824 * includes what the Posix classes (like \w, [:space:]) match.
11825 * Room must be reserved (one time per class) to store such
11826 * classes, either if Perl is compiled so that locale nodes always
11827 * should have this space, or if there is such class info to be
11828 * stored. The space will contain a bit for each named class that
11829 * is to be matched against. This isn't needed for \p{} and
11830 * pseudo-classes, as they are not affected by locale, and hence
11831 * are dealt with separately */
11834 && (ANYOF_LOCALE == ANYOF_CLASS
11835 || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
11839 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11842 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
11843 ANYOF_CLASS_ZERO(ret);
11845 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
11848 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
11850 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
11851 * literal, as is the character that began the false range, i.e.
11852 * the 'a' in the examples */
11856 RExC_parse >= rangebegin ?
11857 RExC_parse - rangebegin : 0;
11858 ckWARN4reg(RExC_parse,
11859 "False [] range \"%*.*s\"",
11861 cp_list = add_cp_to_invlist(cp_list, '-');
11862 cp_list = add_cp_to_invlist(cp_list, prevvalue);
11865 range = 0; /* this was not a true range */
11866 element_count += 2; /* So counts for three values */
11870 switch ((I32)namedclass) {
11872 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
11873 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11874 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
11876 case ANYOF_NALNUMC:
11877 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11878 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
11879 runtime_posix_matches_above_Unicode);
11882 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11883 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
11886 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11887 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
11888 runtime_posix_matches_above_Unicode);
11893 ANYOF_CLASS_SET(ret, namedclass);
11896 #endif /* Not isascii(); just use the hard-coded definition for it */
11897 _invlist_union(posixes, PL_ASCII, &posixes);
11902 ANYOF_CLASS_SET(ret, namedclass);
11906 _invlist_union_complement_2nd(posixes,
11907 PL_ASCII, &posixes);
11908 if (DEPENDS_SEMANTICS) {
11909 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
11916 if (hasISBLANK || ! LOC) {
11917 DO_POSIX(ret, namedclass, posixes,
11918 PL_PosixBlank, PL_XPosixBlank);
11920 else { /* There is no isblank() and we are in locale: We
11921 use the ASCII range and the above-Latin1 range
11923 SV* scratch_list = NULL;
11925 /* Include all above-Latin1 blanks */
11926 _invlist_intersection(PL_AboveLatin1,
11929 /* Add it to the running total of posix classes */
11931 posixes = scratch_list;
11934 _invlist_union(posixes, scratch_list, &posixes);
11935 SvREFCNT_dec(scratch_list);
11937 /* Add the ASCII-range blanks to the running total. */
11938 _invlist_union(posixes, PL_PosixBlank, &posixes);
11942 if (hasISBLANK || ! LOC) {
11943 DO_N_POSIX(ret, namedclass, posixes,
11944 PL_PosixBlank, PL_XPosixBlank);
11946 else { /* There is no isblank() and we are in locale */
11947 SV* scratch_list = NULL;
11949 /* Include all above-Latin1 non-blanks */
11950 _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11953 /* Add them to the running total of posix classes */
11954 _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
11957 posixes = scratch_list;
11960 _invlist_union(posixes, scratch_list, &posixes);
11961 SvREFCNT_dec(scratch_list);
11964 /* Get the list of all non-ASCII-blanks in Latin 1, and
11965 * add them to the running total */
11966 _invlist_subtract(PL_Latin1, PL_PosixBlank,
11968 _invlist_union(posixes, scratch_list, &posixes);
11969 SvREFCNT_dec(scratch_list);
11973 DO_POSIX(ret, namedclass, posixes,
11974 PL_PosixCntrl, PL_XPosixCntrl);
11977 DO_N_POSIX(ret, namedclass, posixes,
11978 PL_PosixCntrl, PL_XPosixCntrl);
11981 /* There are no digits in the Latin1 range outside of
11982 * ASCII, so call the macro that doesn't have to resolve
11984 DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
11985 PL_PosixDigit, "XPosixDigit", listsv);
11988 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11989 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
11990 runtime_posix_matches_above_Unicode);
11993 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11994 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
11997 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
11998 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
11999 runtime_posix_matches_above_Unicode);
12001 case ANYOF_HORIZWS:
12002 /* For these, we use the cp_list, as /d doesn't make a
12003 * difference in what these match. There would be problems
12004 * if these characters had folds other than themselves, as
12005 * cp_list is subject to folding. It turns out that \h
12006 * is just a synonym for XPosixBlank */
12007 _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
12009 case ANYOF_NHORIZWS:
12010 _invlist_union_complement_2nd(cp_list,
12011 PL_XPosixBlank, &cp_list);
12015 { /* These require special handling, as they differ under
12016 folding, matching Cased there (which in the ASCII range
12017 is the same as Alpha */
12023 if (FOLD && ! LOC) {
12024 ascii_source = PL_PosixAlpha;
12025 l1_source = PL_L1Cased;
12029 ascii_source = PL_PosixLower;
12030 l1_source = PL_L1PosixLower;
12031 Xname = "XPosixLower";
12033 if (namedclass == ANYOF_LOWER) {
12034 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12035 ascii_source, l1_source, Xname, listsv);
12038 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12039 posixes, ascii_source, l1_source, Xname, listsv,
12040 runtime_posix_matches_above_Unicode);
12045 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12046 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
12049 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12050 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
12051 runtime_posix_matches_above_Unicode);
12054 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12055 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
12058 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12059 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
12060 runtime_posix_matches_above_Unicode);
12063 DO_POSIX(ret, namedclass, posixes,
12064 PL_PosixSpace, PL_XPosixSpace);
12066 case ANYOF_NPSXSPC:
12067 DO_N_POSIX(ret, namedclass, posixes,
12068 PL_PosixSpace, PL_XPosixSpace);
12071 DO_POSIX(ret, namedclass, posixes,
12072 PL_PerlSpace, PL_XPerlSpace);
12075 DO_N_POSIX(ret, namedclass, posixes,
12076 PL_PerlSpace, PL_XPerlSpace);
12078 case ANYOF_UPPER: /* Same as LOWER, above */
12085 if (FOLD && ! LOC) {
12086 ascii_source = PL_PosixAlpha;
12087 l1_source = PL_L1Cased;
12091 ascii_source = PL_PosixUpper;
12092 l1_source = PL_L1PosixUpper;
12093 Xname = "XPosixUpper";
12095 if (namedclass == ANYOF_UPPER) {
12096 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12097 ascii_source, l1_source, Xname, listsv);
12100 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
12101 posixes, ascii_source, l1_source, Xname, listsv,
12102 runtime_posix_matches_above_Unicode);
12106 case ANYOF_WORDCHAR:
12107 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12108 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
12110 case ANYOF_NWORDCHAR:
12111 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
12112 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
12113 runtime_posix_matches_above_Unicode);
12116 /* For these, we use the cp_list, as /d doesn't make a
12117 * difference in what these match. There would be problems
12118 * if these characters had folds other than themselves, as
12119 * cp_list is subject to folding */
12120 _invlist_union(cp_list, PL_VertSpace, &cp_list);
12122 case ANYOF_NVERTWS:
12123 _invlist_union_complement_2nd(cp_list,
12124 PL_VertSpace, &cp_list);
12127 DO_POSIX(ret, namedclass, posixes,
12128 PL_PosixXDigit, PL_XPosixXDigit);
12130 case ANYOF_NXDIGIT:
12131 DO_N_POSIX(ret, namedclass, posixes,
12132 PL_PosixXDigit, PL_XPosixXDigit);
12135 /* this is to handle \p and \P */
12138 vFAIL("Invalid [::] class");
12142 continue; /* Go get next character */
12144 } /* end of namedclass \blah */
12147 if (prevvalue > value) /* b-a */ {
12148 const int w = RExC_parse - rangebegin;
12149 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
12150 range = 0; /* not a valid range */
12154 prevvalue = value; /* save the beginning of the potential range */
12155 if (RExC_parse+1 < RExC_end
12156 && *RExC_parse == '-'
12157 && RExC_parse[1] != ']')
12161 /* a bad range like \w-, [:word:]- ? */
12162 if (namedclass > OOB_NAMEDCLASS) {
12163 if (ckWARN(WARN_REGEXP)) {
12165 RExC_parse >= rangebegin ?
12166 RExC_parse - rangebegin : 0;
12168 "False [] range \"%*.*s\"",
12172 cp_list = add_cp_to_invlist(cp_list, '-');
12176 range = 1; /* yeah, it's a range! */
12177 continue; /* but do it the next time */
12181 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
12184 /* non-Latin1 code point implies unicode semantics. Must be set in
12185 * pass1 so is there for the whole of pass 2 */
12187 RExC_uni_semantics = 1;
12190 /* Ready to process either the single value, or the completed range.
12191 * For single-valued non-inverted ranges, we consider the possibility
12192 * of multi-char folds. (We made a conscious decision to not do this
12193 * for the other cases because it can often lead to non-intuitive
12195 if (FOLD && ! invert && value == prevvalue) {
12196 if (value == LATIN_SMALL_LETTER_SHARP_S
12197 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
12200 /* Here <value> is indeed a multi-char fold. Get what it is */
12202 U8 foldbuf[UTF8_MAXBYTES_CASE];
12205 UV folded = _to_uni_fold_flags(
12210 | ((LOC) ? FOLD_FLAGS_LOCALE
12211 : (ASCII_FOLD_RESTRICTED)
12212 ? FOLD_FLAGS_NOMIX_ASCII
12216 /* Here, <folded> should be the first character of the
12217 * multi-char fold of <value>, with <foldbuf> containing the
12218 * whole thing. But, if this fold is not allowed (because of
12219 * the flags), <fold> will be the same as <value>, and should
12220 * be processed like any other character, so skip the special
12222 if (folded != value) {
12224 /* Skip if we are recursed, currently parsing the class
12225 * again. Otherwise add this character to the list of
12226 * multi-char folds. */
12227 if (! RExC_in_multi_char_class) {
12228 AV** this_array_ptr;
12230 STRLEN cp_count = utf8_length(foldbuf,
12231 foldbuf + foldlen);
12232 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
12234 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
12237 if (! multi_char_matches) {
12238 multi_char_matches = newAV();
12241 /* <multi_char_matches> is actually an array of arrays.
12242 * There will be one or two top-level elements: [2],
12243 * and/or [3]. The [2] element is an array, each
12244 * element thereof is a character which folds to two
12245 * characters; likewise for [3]. (Unicode guarantees a
12246 * maximum of 3 characters in any fold.) When we
12247 * rewrite the character class below, we will do so
12248 * such that the longest folds are written first, so
12249 * that it prefers the longest matching strings first.
12250 * This is done even if it turns out that any
12251 * quantifier is non-greedy, out of programmer
12252 * laziness. Tom Christiansen has agreed that this is
12253 * ok. This makes the test for the ligature 'ffi' come
12254 * before the test for 'ff' */
12255 if (av_exists(multi_char_matches, cp_count)) {
12256 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12258 this_array = *this_array_ptr;
12261 this_array = newAV();
12262 av_store(multi_char_matches, cp_count,
12265 av_push(this_array, multi_fold);
12268 /* This element should not be processed further in this
12271 value = save_value;
12272 prevvalue = save_prevvalue;
12278 /* Deal with this element of the class */
12281 cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
12283 UV* this_range = _new_invlist(1);
12284 _append_range_to_invlist(this_range, prevvalue, value);
12286 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
12287 * If this range was specified using something like 'i-j', we want
12288 * to include only the 'i' and the 'j', and not anything in
12289 * between, so exclude non-ASCII, non-alphabetics from it.
12290 * However, if the range was specified with something like
12291 * [\x89-\x91] or [\x89-j], all code points within it should be
12292 * included. literal_endpoint==2 means both ends of the range used
12293 * a literal character, not \x{foo} */
12294 if (literal_endpoint == 2
12295 && (prevvalue >= 'a' && value <= 'z')
12296 || (prevvalue >= 'A' && value <= 'Z'))
12298 _invlist_intersection(this_range, PL_ASCII, &this_range, );
12299 _invlist_intersection(this_range, PL_Alpha, &this_range, );
12301 _invlist_union(cp_list, this_range, &cp_list);
12302 literal_endpoint = 0;
12306 range = 0; /* this range (if it was one) is done now */
12307 } /* End of loop through all the text within the brackets */
12309 /* If anything in the class expands to more than one character, we have to
12310 * deal with them by building up a substitute parse string, and recursively
12311 * calling reg() on it, instead of proceeding */
12312 if (multi_char_matches) {
12313 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
12316 char *save_end = RExC_end;
12317 char *save_parse = RExC_parse;
12318 bool first_time = TRUE; /* First multi-char occurrence doesn't get
12323 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
12324 because too confusing */
12326 sv_catpv(substitute_parse, "(?:");
12330 /* Look at the longest folds first */
12331 for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
12333 if (av_exists(multi_char_matches, cp_count)) {
12334 AV** this_array_ptr;
12337 this_array_ptr = (AV**) av_fetch(multi_char_matches,
12339 while ((this_sequence = av_pop(*this_array_ptr)) !=
12342 if (! first_time) {
12343 sv_catpv(substitute_parse, "|");
12345 first_time = FALSE;
12347 sv_catpv(substitute_parse, SvPVX(this_sequence));
12352 /* If the character class contains anything else besides these
12353 * multi-character folds, have to include it in recursive parsing */
12354 if (element_count) {
12355 sv_catpv(substitute_parse, "|[");
12356 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
12357 sv_catpv(substitute_parse, "]");
12360 sv_catpv(substitute_parse, ")");
12363 /* This is a way to get the parse to skip forward a whole named
12364 * sequence instead of matching the 2nd character when it fails the
12366 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
12370 RExC_parse = SvPV(substitute_parse, len);
12371 RExC_end = RExC_parse + len;
12372 RExC_in_multi_char_class = 1;
12373 RExC_emit = (regnode *)orig_emit;
12375 ret = reg(pRExC_state, 1, ®_flags, depth+1);
12377 *flagp |= reg_flags&(HASWIDTH|SPSTART|POSTPONED);
12379 RExC_parse = save_parse;
12380 RExC_end = save_end;
12381 RExC_in_multi_char_class = 0;
12382 SvREFCNT_dec(multi_char_matches);
12386 /* If the character class contains only a single element, it may be
12387 * optimizable into another node type which is smaller and runs faster.
12388 * Check if this is the case for this class */
12389 if (element_count == 1) {
12393 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
12394 [:digit:] or \p{foo} */
12396 /* Certain named classes have equivalents that can appear outside a
12397 * character class, e.g. \w, \H. We use these instead of a
12398 * character class. */
12399 switch ((I32)namedclass) {
12402 /* The first group is for node types that depend on the charset
12403 * modifier to the regex. We first calculate the base node
12404 * type, and if it should be inverted */
12406 case ANYOF_NWORDCHAR:
12409 case ANYOF_WORDCHAR:
12411 goto join_charset_classes;
12418 goto join_charset_classes;
12426 join_charset_classes:
12428 /* Now that we have the base node type, we take advantage
12429 * of the enum ordering of the charset modifiers to get the
12430 * exact node type, For example the base SPACE also has
12431 * SPACEL, SPACEU, and SPACEA */
12433 offset = get_regex_charset(RExC_flags);
12435 /* /aa is the same as /a for these */
12436 if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
12437 offset = REGEX_ASCII_RESTRICTED_CHARSET;
12439 else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
12440 offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
12445 /* The number of varieties of each of these is the same,
12446 * hence, so is the delta between the normal and
12447 * complemented nodes */
12449 op += NALNUM - ALNUM;
12451 *flagp |= HASWIDTH|SIMPLE;
12454 /* The second group doesn't depend of the charset modifiers.
12455 * We just have normal and complemented */
12456 case ANYOF_NHORIZWS:
12459 case ANYOF_HORIZWS:
12461 op = (invert) ? NHORIZWS : HORIZWS;
12462 *flagp |= HASWIDTH|SIMPLE;
12465 case ANYOF_NVERTWS:
12469 op = (invert) ? NVERTWS : VERTWS;
12470 *flagp |= HASWIDTH|SIMPLE;
12480 if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
12485 /* A generic posix class. All the /a ones can be handled
12486 * by the POSIXA opcode. And all are closed under folding
12487 * in the ASCII range, so FOLD doesn't matter */
12488 if (AT_LEAST_ASCII_RESTRICTED
12489 || (! LOC && namedclass == ANYOF_ASCII))
12491 /* The odd numbered ones are the complements of the
12492 * next-lower even number one */
12493 if (namedclass % 2 == 1) {
12497 arg = namedclass_to_classnum(namedclass);
12498 op = (invert) ? NPOSIXA : POSIXA;
12503 else if (value == prevvalue) {
12505 /* Here, the class consists of just a single code point */
12508 if (! LOC && value == '\n') {
12509 op = REG_ANY; /* Optimize [^\n] */
12510 *flagp |= HASWIDTH|SIMPLE;
12514 else if (value < 256 || UTF) {
12516 /* Optimize a single value into an EXACTish node, but not if it
12517 * would require converting the pattern to UTF-8. */
12518 op = compute_EXACTish(pRExC_state);
12520 } /* Otherwise is a range */
12521 else if (! LOC) { /* locale could vary these */
12522 if (prevvalue == '0') {
12523 if (value == '9') {
12524 op = (invert) ? NDIGITA : DIGITA;
12525 *flagp |= HASWIDTH|SIMPLE;
12530 /* Here, we have changed <op> away from its initial value iff we found
12531 * an optimization */
12534 /* Throw away this ANYOF regnode, and emit the calculated one,
12535 * which should correspond to the beginning, not current, state of
12537 const char * cur_parse = RExC_parse;
12538 RExC_parse = (char *)orig_parse;
12542 /* To get locale nodes to not use the full ANYOF size would
12543 * require moving the code above that writes the portions
12544 * of it that aren't in other nodes to after this point.
12545 * e.g. ANYOF_CLASS_SET */
12546 RExC_size = orig_size;
12550 RExC_emit = (regnode *)orig_emit;
12553 ret = reg_node(pRExC_state, op);
12555 if (PL_regkind[op] == POSIXD) {
12559 *flagp |= HASWIDTH|SIMPLE;
12561 else if (PL_regkind[op] == EXACT) {
12562 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
12565 RExC_parse = (char *) cur_parse;
12567 SvREFCNT_dec(listsv);
12574 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
12576 /* If folding, we calculate all characters that could fold to or from the
12577 * ones already on the list */
12578 if (FOLD && cp_list) {
12579 UV start, end; /* End points of code point ranges */
12581 SV* fold_intersection = NULL;
12583 /* If the highest code point is within Latin1, we can use the
12584 * compiled-in Alphas list, and not have to go out to disk. This
12585 * yields two false positives, the masculine and feminine oridinal
12586 * indicators, which are weeded out below using the
12587 * IS_IN_SOME_FOLD_L1() macro */
12588 if (invlist_highest(cp_list) < 256) {
12589 _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
12593 /* Here, there are non-Latin1 code points, so we will have to go
12594 * fetch the list of all the characters that participate in folds
12596 if (! PL_utf8_foldable) {
12597 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
12598 &PL_sv_undef, 1, 0);
12599 PL_utf8_foldable = _get_swash_invlist(swash);
12600 SvREFCNT_dec(swash);
12603 /* This is a hash that for a particular fold gives all characters
12604 * that are involved in it */
12605 if (! PL_utf8_foldclosures) {
12607 /* If we were unable to find any folds, then we likely won't be
12608 * able to find the closures. So just create an empty list.
12609 * Folding will effectively be restricted to the non-Unicode
12610 * rules hard-coded into Perl. (This case happens legitimately
12611 * during compilation of Perl itself before the Unicode tables
12612 * are generated) */
12613 if (_invlist_len(PL_utf8_foldable) == 0) {
12614 PL_utf8_foldclosures = newHV();
12617 /* If the folds haven't been read in, call a fold function
12619 if (! PL_utf8_tofold) {
12620 U8 dummy[UTF8_MAXBYTES+1];
12622 /* This string is just a short named one above \xff */
12623 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
12624 assert(PL_utf8_tofold); /* Verify that worked */
12626 PL_utf8_foldclosures =
12627 _swash_inversion_hash(PL_utf8_tofold);
12631 /* Only the characters in this class that participate in folds need
12632 * be checked. Get the intersection of this class and all the
12633 * possible characters that are foldable. This can quickly narrow
12634 * down a large class */
12635 _invlist_intersection(PL_utf8_foldable, cp_list,
12636 &fold_intersection);
12639 /* Now look at the foldable characters in this class individually */
12640 invlist_iterinit(fold_intersection);
12641 while (invlist_iternext(fold_intersection, &start, &end)) {
12644 /* Locale folding for Latin1 characters is deferred until runtime */
12645 if (LOC && start < 256) {
12649 /* Look at every character in the range */
12650 for (j = start; j <= end; j++) {
12652 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
12659 /* We have the latin1 folding rules hard-coded here so that
12660 * an innocent-looking character class, like /[ks]/i won't
12661 * have to go out to disk to find the possible matches.
12662 * XXX It would be better to generate these via regen, in
12663 * case a new version of the Unicode standard adds new
12664 * mappings, though that is not really likely, and may be
12665 * caught by the default: case of the switch below. */
12667 if (IS_IN_SOME_FOLD_L1(j)) {
12669 /* ASCII is always matched; non-ASCII is matched only
12670 * under Unicode rules */
12671 if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
12673 add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
12677 add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
12681 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
12682 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
12684 /* Certain Latin1 characters have matches outside
12685 * Latin1. To get here, <j> is one of those
12686 * characters. None of these matches is valid for
12687 * ASCII characters under /aa, which is why the 'if'
12688 * just above excludes those. These matches only
12689 * happen when the target string is utf8. The code
12690 * below adds the single fold closures for <j> to the
12691 * inversion list. */
12696 add_cp_to_invlist(cp_list, KELVIN_SIGN);
12700 cp_list = add_cp_to_invlist(cp_list,
12701 LATIN_SMALL_LETTER_LONG_S);
12704 cp_list = add_cp_to_invlist(cp_list,
12705 GREEK_CAPITAL_LETTER_MU);
12706 cp_list = add_cp_to_invlist(cp_list,
12707 GREEK_SMALL_LETTER_MU);
12709 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
12710 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
12712 add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
12714 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
12715 cp_list = add_cp_to_invlist(cp_list,
12716 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
12718 case LATIN_SMALL_LETTER_SHARP_S:
12719 cp_list = add_cp_to_invlist(cp_list,
12720 LATIN_CAPITAL_LETTER_SHARP_S);
12722 case 'F': case 'f':
12723 case 'I': case 'i':
12724 case 'L': case 'l':
12725 case 'T': case 't':
12726 case 'A': case 'a':
12727 case 'H': case 'h':
12728 case 'J': case 'j':
12729 case 'N': case 'n':
12730 case 'W': case 'w':
12731 case 'Y': case 'y':
12732 /* These all are targets of multi-character
12733 * folds from code points that require UTF8 to
12734 * express, so they can't match unless the
12735 * target string is in UTF-8, so no action here
12736 * is necessary, as regexec.c properly handles
12737 * the general case for UTF-8 matching and
12738 * multi-char folds */
12741 /* Use deprecated warning to increase the
12742 * chances of this being output */
12743 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
12750 /* Here is an above Latin1 character. We don't have the rules
12751 * hard-coded for it. First, get its fold. This is the simple
12752 * fold, as the multi-character folds have been handled earlier
12753 * and separated out */
12754 f = _to_uni_fold_flags(j, foldbuf, &foldlen,
12756 ? FOLD_FLAGS_LOCALE
12757 : (ASCII_FOLD_RESTRICTED)
12758 ? FOLD_FLAGS_NOMIX_ASCII
12761 /* Single character fold of above Latin1. Add everything in
12762 * its fold closure to the list that this node should match.
12763 * The fold closures data structure is a hash with the keys
12764 * being the UTF-8 of every character that is folded to, like
12765 * 'k', and the values each an array of all code points that
12766 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
12767 * Multi-character folds are not included */
12768 if ((listp = hv_fetch(PL_utf8_foldclosures,
12769 (char *) foldbuf, foldlen, FALSE)))
12771 AV* list = (AV*) *listp;
12773 for (k = 0; k <= av_len(list); k++) {
12774 SV** c_p = av_fetch(list, k, FALSE);
12777 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
12781 /* /aa doesn't allow folds between ASCII and non-; /l
12782 * doesn't allow them between above and below 256 */
12783 if ((ASCII_FOLD_RESTRICTED
12784 && (isASCII(c) != isASCII(j)))
12785 || (LOC && ((c < 256) != (j < 256))))
12790 /* Folds involving non-ascii Latin1 characters
12791 * under /d are added to a separate list */
12792 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
12794 cp_list = add_cp_to_invlist(cp_list, c);
12797 depends_list = add_cp_to_invlist(depends_list, c);
12803 SvREFCNT_dec(fold_intersection);
12806 /* And combine the result (if any) with any inversion list from posix
12807 * classes. The lists are kept separate up to now because we don't want to
12808 * fold the classes (folding of those is automatically handled by the swash
12809 * fetching code) */
12811 if (! DEPENDS_SEMANTICS) {
12813 _invlist_union(cp_list, posixes, &cp_list);
12814 SvREFCNT_dec(posixes);
12821 /* Under /d, we put into a separate list the Latin1 things that
12822 * match only when the target string is utf8 */
12823 SV* nonascii_but_latin1_properties = NULL;
12824 _invlist_intersection(posixes, PL_Latin1,
12825 &nonascii_but_latin1_properties);
12826 _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
12827 &nonascii_but_latin1_properties);
12828 _invlist_subtract(posixes, nonascii_but_latin1_properties,
12831 _invlist_union(cp_list, posixes, &cp_list);
12832 SvREFCNT_dec(posixes);
12838 if (depends_list) {
12839 _invlist_union(depends_list, nonascii_but_latin1_properties,
12841 SvREFCNT_dec(nonascii_but_latin1_properties);
12844 depends_list = nonascii_but_latin1_properties;
12849 /* And combine the result (if any) with any inversion list from properties.
12850 * The lists are kept separate up to now so that we can distinguish the two
12851 * in regards to matching above-Unicode. A run-time warning is generated
12852 * if a Unicode property is matched against a non-Unicode code point. But,
12853 * we allow user-defined properties to match anything, without any warning,
12854 * and we also suppress the warning if there is a portion of the character
12855 * class that isn't a Unicode property, and which matches above Unicode, \W
12856 * or [\x{110000}] for example.
12857 * (Note that in this case, unlike the Posix one above, there is no
12858 * <depends_list>, because having a Unicode property forces Unicode
12861 bool warn_super = ! has_user_defined_property;
12864 /* If it matters to the final outcome, see if a non-property
12865 * component of the class matches above Unicode. If so, the
12866 * warning gets suppressed. This is true even if just a single
12867 * such code point is specified, as though not strictly correct if
12868 * another such code point is matched against, the fact that they
12869 * are using above-Unicode code points indicates they should know
12870 * the issues involved */
12872 bool non_prop_matches_above_Unicode =
12873 runtime_posix_matches_above_Unicode
12874 | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
12876 non_prop_matches_above_Unicode =
12877 ! non_prop_matches_above_Unicode;
12879 warn_super = ! non_prop_matches_above_Unicode;
12882 _invlist_union(properties, cp_list, &cp_list);
12883 SvREFCNT_dec(properties);
12886 cp_list = properties;
12890 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
12894 /* Here, we have calculated what code points should be in the character
12897 * Now we can see about various optimizations. Fold calculation (which we
12898 * did above) needs to take place before inversion. Otherwise /[^k]/i
12899 * would invert to include K, which under /i would match k, which it
12900 * shouldn't. Therefore we can't invert folded locale now, as it won't be
12901 * folded until runtime */
12903 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
12904 * at compile time. Besides not inverting folded locale now, we can't
12905 * invert if there are things such as \w, which aren't known until runtime
12908 && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
12910 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12912 _invlist_invert(cp_list);
12914 /* Any swash can't be used as-is, because we've inverted things */
12916 SvREFCNT_dec(swash);
12920 /* Clear the invert flag since have just done it here */
12924 /* If we didn't do folding, it's because some information isn't available
12925 * until runtime; set the run-time fold flag for these. (We don't have to
12926 * worry about properties folding, as that is taken care of by the swash
12930 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
12933 /* Some character classes are equivalent to other nodes. Such nodes take
12934 * up less room and generally fewer operations to execute than ANYOF nodes.
12935 * Above, we checked for and optimized into some such equivalents for
12936 * certain common classes that are easy to test. Getting to this point in
12937 * the code means that the class didn't get optimized there. Since this
12938 * code is only executed in Pass 2, it is too late to save space--it has
12939 * been allocated in Pass 1, and currently isn't given back. But turning
12940 * things into an EXACTish node can allow the optimizer to join it to any
12941 * adjacent such nodes. And if the class is equivalent to things like /./,
12942 * expensive run-time swashes can be avoided. Now that we have more
12943 * complete information, we can find things necessarily missed by the
12944 * earlier code. I (khw) am not sure how much to look for here. It would
12945 * be easy, but perhaps too slow, to check any candidates against all the
12946 * node types they could possibly match using _invlistEQ(). */
12951 && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
12952 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
12955 U8 op = END; /* The optimzation node-type */
12956 const char * cur_parse= RExC_parse;
12958 invlist_iterinit(cp_list);
12959 if (! invlist_iternext(cp_list, &start, &end)) {
12961 /* Here, the list is empty. This happens, for example, when a
12962 * Unicode property is the only thing in the character class, and
12963 * it doesn't match anything. (perluniprops.pod notes such
12966 *flagp |= HASWIDTH|SIMPLE;
12968 else if (start == end) { /* The range is a single code point */
12969 if (! invlist_iternext(cp_list, &start, &end)
12971 /* Don't do this optimization if it would require changing
12972 * the pattern to UTF-8 */
12973 && (start < 256 || UTF))
12975 /* Here, the list contains a single code point. Can optimize
12976 * into an EXACT node */
12985 /* A locale node under folding with one code point can be
12986 * an EXACTFL, as its fold won't be calculated until
12992 /* Here, we are generally folding, but there is only one
12993 * code point to match. If we have to, we use an EXACT
12994 * node, but it would be better for joining with adjacent
12995 * nodes in the optimization pass if we used the same
12996 * EXACTFish node that any such are likely to be. We can
12997 * do this iff the code point doesn't participate in any
12998 * folds. For example, an EXACTF of a colon is the same as
12999 * an EXACT one, since nothing folds to or from a colon. */
13001 if (IS_IN_SOME_FOLD_L1(value)) {
13006 if (! PL_utf8_foldable) {
13007 SV* swash = swash_init("utf8", "_Perl_Any_Folds",
13008 &PL_sv_undef, 1, 0);
13009 PL_utf8_foldable = _get_swash_invlist(swash);
13010 SvREFCNT_dec(swash);
13012 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
13017 /* If we haven't found the node type, above, it means we
13018 * can use the prevailing one */
13020 op = compute_EXACTish(pRExC_state);
13025 else if (start == 0) {
13026 if (end == UV_MAX) {
13028 *flagp |= HASWIDTH|SIMPLE;
13031 else if (end == '\n' - 1
13032 && invlist_iternext(cp_list, &start, &end)
13033 && start == '\n' + 1 && end == UV_MAX)
13036 *flagp |= HASWIDTH|SIMPLE;
13042 RExC_parse = (char *)orig_parse;
13043 RExC_emit = (regnode *)orig_emit;
13045 ret = reg_node(pRExC_state, op);
13047 RExC_parse = (char *)cur_parse;
13049 if (PL_regkind[op] == EXACT) {
13050 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
13053 SvREFCNT_dec(listsv);
13058 /* Here, <cp_list> contains all the code points we can determine at
13059 * compile time that match under all conditions. Go through it, and
13060 * for things that belong in the bitmap, put them there, and delete from
13061 * <cp_list>. While we are at it, see if everything above 255 is in the
13062 * list, and if so, set a flag to speed up execution */
13063 ANYOF_BITMAP_ZERO(ret);
13066 /* This gets set if we actually need to modify things */
13067 bool change_invlist = FALSE;
13071 /* Start looking through <cp_list> */
13072 invlist_iterinit(cp_list);
13073 while (invlist_iternext(cp_list, &start, &end)) {
13077 if (end == UV_MAX && start <= 256) {
13078 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
13081 /* Quit if are above what we should change */
13086 change_invlist = TRUE;
13088 /* Set all the bits in the range, up to the max that we are doing */
13089 high = (end < 255) ? end : 255;
13090 for (i = start; i <= (int) high; i++) {
13091 if (! ANYOF_BITMAP_TEST(ret, i)) {
13092 ANYOF_BITMAP_SET(ret, i);
13099 /* Done with loop; remove any code points that are in the bitmap from
13101 if (change_invlist) {
13102 _invlist_subtract(cp_list, PL_Latin1, &cp_list);
13105 /* If have completely emptied it, remove it completely */
13106 if (_invlist_len(cp_list) == 0) {
13107 SvREFCNT_dec(cp_list);
13113 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
13116 /* Here, the bitmap has been populated with all the Latin1 code points that
13117 * always match. Can now add to the overall list those that match only
13118 * when the target string is UTF-8 (<depends_list>). */
13119 if (depends_list) {
13121 _invlist_union(cp_list, depends_list, &cp_list);
13122 SvREFCNT_dec(depends_list);
13125 cp_list = depends_list;
13129 /* If there is a swash and more than one element, we can't use the swash in
13130 * the optimization below. */
13131 if (swash && element_count > 1) {
13132 SvREFCNT_dec(swash);
13137 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13139 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
13140 SvREFCNT_dec(listsv);
13143 /* av[0] stores the character class description in its textual form:
13144 * used later (regexec.c:Perl_regclass_swash()) to initialize the
13145 * appropriate swash, and is also useful for dumping the regnode.
13146 * av[1] if NULL, is a placeholder to later contain the swash computed
13147 * from av[0]. But if no further computation need be done, the
13148 * swash is stored there now.
13149 * av[2] stores the cp_list inversion list for use in addition or
13150 * instead of av[0]; used only if av[1] is NULL
13151 * av[3] is set if any component of the class is from a user-defined
13152 * property; used only if av[1] is NULL */
13153 AV * const av = newAV();
13156 av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
13160 av_store(av, 1, swash);
13161 SvREFCNT_dec(cp_list);
13164 av_store(av, 1, NULL);
13166 av_store(av, 2, cp_list);
13167 av_store(av, 3, newSVuv(has_user_defined_property));
13171 rv = newRV_noinc(MUTABLE_SV(av));
13172 n = add_data(pRExC_state, 1, "s");
13173 RExC_rxi->data->data[n] = (void*)rv;
13177 *flagp |= HASWIDTH|SIMPLE;
13180 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
13183 /* reg_skipcomment()
13185 Absorbs an /x style # comments from the input stream.
13186 Returns true if there is more text remaining in the stream.
13187 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
13188 terminates the pattern without including a newline.
13190 Note its the callers responsibility to ensure that we are
13191 actually in /x mode
13196 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
13200 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
13202 while (RExC_parse < RExC_end)
13203 if (*RExC_parse++ == '\n') {
13208 /* we ran off the end of the pattern without ending
13209 the comment, so we have to add an \n when wrapping */
13210 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
13218 Advances the parse position, and optionally absorbs
13219 "whitespace" from the inputstream.
13221 Without /x "whitespace" means (?#...) style comments only,
13222 with /x this means (?#...) and # comments and whitespace proper.
13224 Returns the RExC_parse point from BEFORE the scan occurs.
13226 This is the /x friendly way of saying RExC_parse++.
13230 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
13232 char* const retval = RExC_parse++;
13234 PERL_ARGS_ASSERT_NEXTCHAR;
13237 if (RExC_end - RExC_parse >= 3
13238 && *RExC_parse == '('
13239 && RExC_parse[1] == '?'
13240 && RExC_parse[2] == '#')
13242 while (*RExC_parse != ')') {
13243 if (RExC_parse == RExC_end)
13244 FAIL("Sequence (?#... not terminated");
13250 if (RExC_flags & RXf_PMf_EXTENDED) {
13251 if (isSPACE(*RExC_parse)) {
13255 else if (*RExC_parse == '#') {
13256 if ( reg_skipcomment( pRExC_state ) )
13265 - reg_node - emit a node
13267 STATIC regnode * /* Location. */
13268 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
13272 regnode * const ret = RExC_emit;
13273 GET_RE_DEBUG_FLAGS_DECL;
13275 PERL_ARGS_ASSERT_REG_NODE;
13278 SIZE_ALIGN(RExC_size);
13282 if (RExC_emit >= RExC_emit_bound)
13283 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13284 op, RExC_emit, RExC_emit_bound);
13286 NODE_ALIGN_FILL(ret);
13288 FILL_ADVANCE_NODE(ptr, op);
13289 #ifdef RE_TRACK_PATTERN_OFFSETS
13290 if (RExC_offsets) { /* MJD */
13291 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
13292 "reg_node", __LINE__,
13294 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
13295 ? "Overwriting end of array!\n" : "OK",
13296 (UV)(RExC_emit - RExC_emit_start),
13297 (UV)(RExC_parse - RExC_start),
13298 (UV)RExC_offsets[0]));
13299 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
13307 - reganode - emit a node with an argument
13309 STATIC regnode * /* Location. */
13310 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
13314 regnode * const ret = RExC_emit;
13315 GET_RE_DEBUG_FLAGS_DECL;
13317 PERL_ARGS_ASSERT_REGANODE;
13320 SIZE_ALIGN(RExC_size);
13325 assert(2==regarglen[op]+1);
13327 Anything larger than this has to allocate the extra amount.
13328 If we changed this to be:
13330 RExC_size += (1 + regarglen[op]);
13332 then it wouldn't matter. Its not clear what side effect
13333 might come from that so its not done so far.
13338 if (RExC_emit >= RExC_emit_bound)
13339 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
13340 op, RExC_emit, RExC_emit_bound);
13342 NODE_ALIGN_FILL(ret);
13344 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
13345 #ifdef RE_TRACK_PATTERN_OFFSETS
13346 if (RExC_offsets) { /* MJD */
13347 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13351 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
13352 "Overwriting end of array!\n" : "OK",
13353 (UV)(RExC_emit - RExC_emit_start),
13354 (UV)(RExC_parse - RExC_start),
13355 (UV)RExC_offsets[0]));
13356 Set_Cur_Node_Offset;
13364 - reguni - emit (if appropriate) a Unicode character
13367 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
13371 PERL_ARGS_ASSERT_REGUNI;
13373 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
13377 - reginsert - insert an operator in front of already-emitted operand
13379 * Means relocating the operand.
13382 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
13388 const int offset = regarglen[(U8)op];
13389 const int size = NODE_STEP_REGNODE + offset;
13390 GET_RE_DEBUG_FLAGS_DECL;
13392 PERL_ARGS_ASSERT_REGINSERT;
13393 PERL_UNUSED_ARG(depth);
13394 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13395 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
13404 if (RExC_open_parens) {
13406 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
13407 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
13408 if ( RExC_open_parens[paren] >= opnd ) {
13409 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
13410 RExC_open_parens[paren] += size;
13412 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
13414 if ( RExC_close_parens[paren] >= opnd ) {
13415 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
13416 RExC_close_parens[paren] += size;
13418 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
13423 while (src > opnd) {
13424 StructCopy(--src, --dst, regnode);
13425 #ifdef RE_TRACK_PATTERN_OFFSETS
13426 if (RExC_offsets) { /* MJD 20010112 */
13427 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
13431 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
13432 ? "Overwriting end of array!\n" : "OK",
13433 (UV)(src - RExC_emit_start),
13434 (UV)(dst - RExC_emit_start),
13435 (UV)RExC_offsets[0]));
13436 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
13437 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
13443 place = opnd; /* Op node, where operand used to be. */
13444 #ifdef RE_TRACK_PATTERN_OFFSETS
13445 if (RExC_offsets) { /* MJD */
13446 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
13450 (UV)(place - RExC_emit_start) > RExC_offsets[0]
13451 ? "Overwriting end of array!\n" : "OK",
13452 (UV)(place - RExC_emit_start),
13453 (UV)(RExC_parse - RExC_start),
13454 (UV)RExC_offsets[0]));
13455 Set_Node_Offset(place, RExC_parse);
13456 Set_Node_Length(place, 1);
13459 src = NEXTOPER(place);
13460 FILL_ADVANCE_NODE(place, op);
13461 Zero(src, offset, regnode);
13465 - regtail - set the next-pointer at the end of a node chain of p to val.
13466 - SEE ALSO: regtail_study
13468 /* TODO: All three parms should be const */
13470 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13474 GET_RE_DEBUG_FLAGS_DECL;
13476 PERL_ARGS_ASSERT_REGTAIL;
13478 PERL_UNUSED_ARG(depth);
13484 /* Find last node. */
13487 regnode * const temp = regnext(scan);
13489 SV * const mysv=sv_newmortal();
13490 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
13491 regprop(RExC_rx, mysv, scan);
13492 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
13493 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
13494 (temp == NULL ? "->" : ""),
13495 (temp == NULL ? PL_reg_name[OP(val)] : "")
13503 if (reg_off_by_arg[OP(scan)]) {
13504 ARG_SET(scan, val - scan);
13507 NEXT_OFF(scan) = val - scan;
13513 - regtail_study - set the next-pointer at the end of a node chain of p to val.
13514 - Look for optimizable sequences at the same time.
13515 - currently only looks for EXACT chains.
13517 This is experimental code. The idea is to use this routine to perform
13518 in place optimizations on branches and groups as they are constructed,
13519 with the long term intention of removing optimization from study_chunk so
13520 that it is purely analytical.
13522 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
13523 to control which is which.
13526 /* TODO: All four parms should be const */
13529 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
13534 #ifdef EXPERIMENTAL_INPLACESCAN
13537 GET_RE_DEBUG_FLAGS_DECL;
13539 PERL_ARGS_ASSERT_REGTAIL_STUDY;
13545 /* Find last node. */
13549 regnode * const temp = regnext(scan);
13550 #ifdef EXPERIMENTAL_INPLACESCAN
13551 if (PL_regkind[OP(scan)] == EXACT) {
13552 bool has_exactf_sharp_s; /* Unexamined in this routine */
13553 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
13558 switch (OP(scan)) {
13564 case EXACTFU_TRICKYFOLD:
13566 if( exact == PSEUDO )
13568 else if ( exact != OP(scan) )
13577 SV * const mysv=sv_newmortal();
13578 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
13579 regprop(RExC_rx, mysv, scan);
13580 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
13581 SvPV_nolen_const(mysv),
13582 REG_NODE_NUM(scan),
13583 PL_reg_name[exact]);
13590 SV * const mysv_val=sv_newmortal();
13591 DEBUG_PARSE_MSG("");
13592 regprop(RExC_rx, mysv_val, val);
13593 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
13594 SvPV_nolen_const(mysv_val),
13595 (IV)REG_NODE_NUM(val),
13599 if (reg_off_by_arg[OP(scan)]) {
13600 ARG_SET(scan, val - scan);
13603 NEXT_OFF(scan) = val - scan;
13611 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
13615 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
13621 for (bit=0; bit<32; bit++) {
13622 if (flags & (1<<bit)) {
13623 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
13626 if (!set++ && lead)
13627 PerlIO_printf(Perl_debug_log, "%s",lead);
13628 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
13631 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
13632 if (!set++ && lead) {
13633 PerlIO_printf(Perl_debug_log, "%s",lead);
13636 case REGEX_UNICODE_CHARSET:
13637 PerlIO_printf(Perl_debug_log, "UNICODE");
13639 case REGEX_LOCALE_CHARSET:
13640 PerlIO_printf(Perl_debug_log, "LOCALE");
13642 case REGEX_ASCII_RESTRICTED_CHARSET:
13643 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
13645 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
13646 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
13649 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
13655 PerlIO_printf(Perl_debug_log, "\n");
13657 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
13663 Perl_regdump(pTHX_ const regexp *r)
13667 SV * const sv = sv_newmortal();
13668 SV *dsv= sv_newmortal();
13669 RXi_GET_DECL(r,ri);
13670 GET_RE_DEBUG_FLAGS_DECL;
13672 PERL_ARGS_ASSERT_REGDUMP;
13674 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
13676 /* Header fields of interest. */
13677 if (r->anchored_substr) {
13678 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
13679 RE_SV_DUMPLEN(r->anchored_substr), 30);
13680 PerlIO_printf(Perl_debug_log,
13681 "anchored %s%s at %"IVdf" ",
13682 s, RE_SV_TAIL(r->anchored_substr),
13683 (IV)r->anchored_offset);
13684 } else if (r->anchored_utf8) {
13685 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
13686 RE_SV_DUMPLEN(r->anchored_utf8), 30);
13687 PerlIO_printf(Perl_debug_log,
13688 "anchored utf8 %s%s at %"IVdf" ",
13689 s, RE_SV_TAIL(r->anchored_utf8),
13690 (IV)r->anchored_offset);
13692 if (r->float_substr) {
13693 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
13694 RE_SV_DUMPLEN(r->float_substr), 30);
13695 PerlIO_printf(Perl_debug_log,
13696 "floating %s%s at %"IVdf"..%"UVuf" ",
13697 s, RE_SV_TAIL(r->float_substr),
13698 (IV)r->float_min_offset, (UV)r->float_max_offset);
13699 } else if (r->float_utf8) {
13700 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
13701 RE_SV_DUMPLEN(r->float_utf8), 30);
13702 PerlIO_printf(Perl_debug_log,
13703 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
13704 s, RE_SV_TAIL(r->float_utf8),
13705 (IV)r->float_min_offset, (UV)r->float_max_offset);
13707 if (r->check_substr || r->check_utf8)
13708 PerlIO_printf(Perl_debug_log,
13710 (r->check_substr == r->float_substr
13711 && r->check_utf8 == r->float_utf8
13712 ? "(checking floating" : "(checking anchored"));
13713 if (r->extflags & RXf_NOSCAN)
13714 PerlIO_printf(Perl_debug_log, " noscan");
13715 if (r->extflags & RXf_CHECK_ALL)
13716 PerlIO_printf(Perl_debug_log, " isall");
13717 if (r->check_substr || r->check_utf8)
13718 PerlIO_printf(Perl_debug_log, ") ");
13720 if (ri->regstclass) {
13721 regprop(r, sv, ri->regstclass);
13722 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
13724 if (r->extflags & RXf_ANCH) {
13725 PerlIO_printf(Perl_debug_log, "anchored");
13726 if (r->extflags & RXf_ANCH_BOL)
13727 PerlIO_printf(Perl_debug_log, "(BOL)");
13728 if (r->extflags & RXf_ANCH_MBOL)
13729 PerlIO_printf(Perl_debug_log, "(MBOL)");
13730 if (r->extflags & RXf_ANCH_SBOL)
13731 PerlIO_printf(Perl_debug_log, "(SBOL)");
13732 if (r->extflags & RXf_ANCH_GPOS)
13733 PerlIO_printf(Perl_debug_log, "(GPOS)");
13734 PerlIO_putc(Perl_debug_log, ' ');
13736 if (r->extflags & RXf_GPOS_SEEN)
13737 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
13738 if (r->intflags & PREGf_SKIP)
13739 PerlIO_printf(Perl_debug_log, "plus ");
13740 if (r->intflags & PREGf_IMPLICIT)
13741 PerlIO_printf(Perl_debug_log, "implicit ");
13742 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
13743 if (r->extflags & RXf_EVAL_SEEN)
13744 PerlIO_printf(Perl_debug_log, "with eval ");
13745 PerlIO_printf(Perl_debug_log, "\n");
13746 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
13748 PERL_ARGS_ASSERT_REGDUMP;
13749 PERL_UNUSED_CONTEXT;
13750 PERL_UNUSED_ARG(r);
13751 #endif /* DEBUGGING */
13755 - regprop - printable representation of opcode
13757 #define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
13760 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
13761 if (flags & ANYOF_INVERT) \
13762 /*make sure the invert info is in each */ \
13763 sv_catpvs(sv, "^"); \
13769 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
13775 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
13776 static const char * const anyofs[] = {
13808 RXi_GET_DECL(prog,progi);
13809 GET_RE_DEBUG_FLAGS_DECL;
13811 PERL_ARGS_ASSERT_REGPROP;
13815 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
13816 /* It would be nice to FAIL() here, but this may be called from
13817 regexec.c, and it would be hard to supply pRExC_state. */
13818 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13819 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
13821 k = PL_regkind[OP(o)];
13824 sv_catpvs(sv, " ");
13825 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
13826 * is a crude hack but it may be the best for now since
13827 * we have no flag "this EXACTish node was UTF-8"
13829 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
13830 PERL_PV_ESCAPE_UNI_DETECT |
13831 PERL_PV_ESCAPE_NONASCII |
13832 PERL_PV_PRETTY_ELLIPSES |
13833 PERL_PV_PRETTY_LTGT |
13834 PERL_PV_PRETTY_NOCLEAR
13836 } else if (k == TRIE) {
13837 /* print the details of the trie in dumpuntil instead, as
13838 * progi->data isn't available here */
13839 const char op = OP(o);
13840 const U32 n = ARG(o);
13841 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
13842 (reg_ac_data *)progi->data->data[n] :
13844 const reg_trie_data * const trie
13845 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
13847 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
13848 DEBUG_TRIE_COMPILE_r(
13849 Perl_sv_catpvf(aTHX_ sv,
13850 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
13851 (UV)trie->startstate,
13852 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
13853 (UV)trie->wordcount,
13856 (UV)TRIE_CHARCOUNT(trie),
13857 (UV)trie->uniquecharcount
13860 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
13862 int rangestart = -1;
13863 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
13864 sv_catpvs(sv, "[");
13865 for (i = 0; i <= 256; i++) {
13866 if (i < 256 && BITMAP_TEST(bitmap,i)) {
13867 if (rangestart == -1)
13869 } else if (rangestart != -1) {
13870 if (i <= rangestart + 3)
13871 for (; rangestart < i; rangestart++)
13872 put_byte(sv, rangestart);
13874 put_byte(sv, rangestart);
13875 sv_catpvs(sv, "-");
13876 put_byte(sv, i - 1);
13881 sv_catpvs(sv, "]");
13884 } else if (k == CURLY) {
13885 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
13886 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
13887 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
13889 else if (k == WHILEM && o->flags) /* Ordinal/of */
13890 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
13891 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
13892 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
13893 if ( RXp_PAREN_NAMES(prog) ) {
13894 if ( k != REF || (OP(o) < NREF)) {
13895 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
13896 SV **name= av_fetch(list, ARG(o), 0 );
13898 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13901 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
13902 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
13903 I32 *nums=(I32*)SvPVX(sv_dat);
13904 SV **name= av_fetch(list, nums[0], 0 );
13907 for ( n=0; n<SvIVX(sv_dat); n++ ) {
13908 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
13909 (n ? "," : ""), (IV)nums[n]);
13911 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
13915 } else if (k == GOSUB)
13916 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
13917 else if (k == VERB) {
13919 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
13920 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
13921 } else if (k == LOGICAL)
13922 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
13923 else if (k == ANYOF) {
13924 int i, rangestart = -1;
13925 const U8 flags = ANYOF_FLAGS(o);
13929 if (flags & ANYOF_LOCALE)
13930 sv_catpvs(sv, "{loc}");
13931 if (flags & ANYOF_LOC_FOLD)
13932 sv_catpvs(sv, "{i}");
13933 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
13934 if (flags & ANYOF_INVERT)
13935 sv_catpvs(sv, "^");
13937 /* output what the standard cp 0-255 bitmap matches */
13938 for (i = 0; i <= 256; i++) {
13939 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
13940 if (rangestart == -1)
13942 } else if (rangestart != -1) {
13943 if (i <= rangestart + 3)
13944 for (; rangestart < i; rangestart++)
13945 put_byte(sv, rangestart);
13947 put_byte(sv, rangestart);
13948 sv_catpvs(sv, "-");
13949 put_byte(sv, i - 1);
13956 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13957 /* output any special charclass tests (used entirely under use locale) */
13958 if (ANYOF_CLASS_TEST_ANY_SET(o))
13959 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
13960 if (ANYOF_CLASS_TEST(o,i)) {
13961 sv_catpv(sv, anyofs[i]);
13965 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
13967 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
13968 sv_catpvs(sv, "{non-utf8-latin1-all}");
13971 /* output information about the unicode matching */
13972 if (flags & ANYOF_UNICODE_ALL)
13973 sv_catpvs(sv, "{unicode_all}");
13974 else if (ANYOF_NONBITMAP(o))
13975 sv_catpvs(sv, "{unicode}");
13976 if (flags & ANYOF_NONBITMAP_NON_UTF8)
13977 sv_catpvs(sv, "{outside bitmap}");
13979 if (ANYOF_NONBITMAP(o)) {
13980 SV *lv; /* Set if there is something outside the bit map */
13981 SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
13982 bool byte_output = FALSE; /* If something in the bitmap has been
13985 if (lv && lv != &PL_sv_undef) {
13987 U8 s[UTF8_MAXBYTES_CASE+1];
13989 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
13990 uvchr_to_utf8(s, i);
13993 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
13997 && swash_fetch(sw, s, TRUE))
13999 if (rangestart == -1)
14001 } else if (rangestart != -1) {
14002 byte_output = TRUE;
14003 if (i <= rangestart + 3)
14004 for (; rangestart < i; rangestart++) {
14005 put_byte(sv, rangestart);
14008 put_byte(sv, rangestart);
14009 sv_catpvs(sv, "-");
14018 char *s = savesvpv(lv);
14019 char * const origs = s;
14021 while (*s && *s != '\n')
14025 const char * const t = ++s;
14028 sv_catpvs(sv, " ");
14034 /* Truncate very long output */
14035 if (s - origs > 256) {
14036 Perl_sv_catpvf(aTHX_ sv,
14038 (int) (s - origs - 1),
14044 else if (*s == '\t') {
14063 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
14065 else if (k == POSIXD) {
14066 U8 index = FLAGS(o) * 2;
14067 if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
14068 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
14071 sv_catpv(sv, anyofs[index]);
14074 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
14075 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
14077 PERL_UNUSED_CONTEXT;
14078 PERL_UNUSED_ARG(sv);
14079 PERL_UNUSED_ARG(o);
14080 PERL_UNUSED_ARG(prog);
14081 #endif /* DEBUGGING */
14085 Perl_re_intuit_string(pTHX_ REGEXP * const r)
14086 { /* Assume that RE_INTUIT is set */
14088 struct regexp *const prog = (struct regexp *)SvANY(r);
14089 GET_RE_DEBUG_FLAGS_DECL;
14091 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
14092 PERL_UNUSED_CONTEXT;
14096 const char * const s = SvPV_nolen_const(prog->check_substr
14097 ? prog->check_substr : prog->check_utf8);
14099 if (!PL_colorset) reginitcolors();
14100 PerlIO_printf(Perl_debug_log,
14101 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
14103 prog->check_substr ? "" : "utf8 ",
14104 PL_colors[5],PL_colors[0],
14107 (strlen(s) > 60 ? "..." : ""));
14110 return prog->check_substr ? prog->check_substr : prog->check_utf8;
14116 handles refcounting and freeing the perl core regexp structure. When
14117 it is necessary to actually free the structure the first thing it
14118 does is call the 'free' method of the regexp_engine associated to
14119 the regexp, allowing the handling of the void *pprivate; member
14120 first. (This routine is not overridable by extensions, which is why
14121 the extensions free is called first.)
14123 See regdupe and regdupe_internal if you change anything here.
14125 #ifndef PERL_IN_XSUB_RE
14127 Perl_pregfree(pTHX_ REGEXP *r)
14133 Perl_pregfree2(pTHX_ REGEXP *rx)
14136 struct regexp *const r = (struct regexp *)SvANY(rx);
14137 GET_RE_DEBUG_FLAGS_DECL;
14139 PERL_ARGS_ASSERT_PREGFREE2;
14141 if (r->mother_re) {
14142 ReREFCNT_dec(r->mother_re);
14144 CALLREGFREE_PVT(rx); /* free the private data */
14145 SvREFCNT_dec(RXp_PAREN_NAMES(r));
14148 SvREFCNT_dec(r->anchored_substr);
14149 SvREFCNT_dec(r->anchored_utf8);
14150 SvREFCNT_dec(r->float_substr);
14151 SvREFCNT_dec(r->float_utf8);
14152 Safefree(r->substrs);
14154 RX_MATCH_COPY_FREE(rx);
14155 #ifdef PERL_OLD_COPY_ON_WRITE
14156 SvREFCNT_dec(r->saved_copy);
14159 SvREFCNT_dec(r->qr_anoncv);
14164 This is a hacky workaround to the structural issue of match results
14165 being stored in the regexp structure which is in turn stored in
14166 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
14167 could be PL_curpm in multiple contexts, and could require multiple
14168 result sets being associated with the pattern simultaneously, such
14169 as when doing a recursive match with (??{$qr})
14171 The solution is to make a lightweight copy of the regexp structure
14172 when a qr// is returned from the code executed by (??{$qr}) this
14173 lightweight copy doesn't actually own any of its data except for
14174 the starp/end and the actual regexp structure itself.
14180 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
14182 struct regexp *ret;
14183 struct regexp *const r = (struct regexp *)SvANY(rx);
14185 PERL_ARGS_ASSERT_REG_TEMP_COPY;
14188 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
14189 ret = (struct regexp *)SvANY(ret_x);
14191 (void)ReREFCNT_inc(rx);
14192 /* We can take advantage of the existing "copied buffer" mechanism in SVs
14193 by pointing directly at the buffer, but flagging that the allocated
14194 space in the copy is zero. As we've just done a struct copy, it's now
14195 a case of zero-ing that, rather than copying the current length. */
14196 SvPV_set(ret_x, RX_WRAPPED(rx));
14197 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
14198 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
14199 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
14200 SvLEN_set(ret_x, 0);
14201 SvSTASH_set(ret_x, NULL);
14202 SvMAGIC_set(ret_x, NULL);
14204 const I32 npar = r->nparens+1;
14205 Newx(ret->offs, npar, regexp_paren_pair);
14206 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14209 Newx(ret->substrs, 1, struct reg_substr_data);
14210 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14212 SvREFCNT_inc_void(ret->anchored_substr);
14213 SvREFCNT_inc_void(ret->anchored_utf8);
14214 SvREFCNT_inc_void(ret->float_substr);
14215 SvREFCNT_inc_void(ret->float_utf8);
14217 /* check_substr and check_utf8, if non-NULL, point to either their
14218 anchored or float namesakes, and don't hold a second reference. */
14220 RX_MATCH_COPIED_off(ret_x);
14221 #ifdef PERL_OLD_COPY_ON_WRITE
14222 ret->saved_copy = NULL;
14224 ret->mother_re = rx;
14225 SvREFCNT_inc_void(ret->qr_anoncv);
14231 /* regfree_internal()
14233 Free the private data in a regexp. This is overloadable by
14234 extensions. Perl takes care of the regexp structure in pregfree(),
14235 this covers the *pprivate pointer which technically perl doesn't
14236 know about, however of course we have to handle the
14237 regexp_internal structure when no extension is in use.
14239 Note this is called before freeing anything in the regexp
14244 Perl_regfree_internal(pTHX_ REGEXP * const rx)
14247 struct regexp *const r = (struct regexp *)SvANY(rx);
14248 RXi_GET_DECL(r,ri);
14249 GET_RE_DEBUG_FLAGS_DECL;
14251 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
14257 SV *dsv= sv_newmortal();
14258 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
14259 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
14260 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
14261 PL_colors[4],PL_colors[5],s);
14264 #ifdef RE_TRACK_PATTERN_OFFSETS
14266 Safefree(ri->u.offsets); /* 20010421 MJD */
14268 if (ri->code_blocks) {
14270 for (n = 0; n < ri->num_code_blocks; n++)
14271 SvREFCNT_dec(ri->code_blocks[n].src_regex);
14272 Safefree(ri->code_blocks);
14276 int n = ri->data->count;
14279 /* If you add a ->what type here, update the comment in regcomp.h */
14280 switch (ri->data->what[n]) {
14286 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
14289 Safefree(ri->data->data[n]);
14295 { /* Aho Corasick add-on structure for a trie node.
14296 Used in stclass optimization only */
14298 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
14300 refcount = --aho->refcount;
14303 PerlMemShared_free(aho->states);
14304 PerlMemShared_free(aho->fail);
14305 /* do this last!!!! */
14306 PerlMemShared_free(ri->data->data[n]);
14307 PerlMemShared_free(ri->regstclass);
14313 /* trie structure. */
14315 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
14317 refcount = --trie->refcount;
14320 PerlMemShared_free(trie->charmap);
14321 PerlMemShared_free(trie->states);
14322 PerlMemShared_free(trie->trans);
14324 PerlMemShared_free(trie->bitmap);
14326 PerlMemShared_free(trie->jump);
14327 PerlMemShared_free(trie->wordinfo);
14328 /* do this last!!!! */
14329 PerlMemShared_free(ri->data->data[n]);
14334 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
14337 Safefree(ri->data->what);
14338 Safefree(ri->data);
14344 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
14345 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
14346 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
14349 re_dup - duplicate a regexp.
14351 This routine is expected to clone a given regexp structure. It is only
14352 compiled under USE_ITHREADS.
14354 After all of the core data stored in struct regexp is duplicated
14355 the regexp_engine.dupe method is used to copy any private data
14356 stored in the *pprivate pointer. This allows extensions to handle
14357 any duplication it needs to do.
14359 See pregfree() and regfree_internal() if you change anything here.
14361 #if defined(USE_ITHREADS)
14362 #ifndef PERL_IN_XSUB_RE
14364 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
14368 const struct regexp *r = (const struct regexp *)SvANY(sstr);
14369 struct regexp *ret = (struct regexp *)SvANY(dstr);
14371 PERL_ARGS_ASSERT_RE_DUP_GUTS;
14373 npar = r->nparens+1;
14374 Newx(ret->offs, npar, regexp_paren_pair);
14375 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
14377 /* no need to copy these */
14378 Newx(ret->swap, npar, regexp_paren_pair);
14381 if (ret->substrs) {
14382 /* Do it this way to avoid reading from *r after the StructCopy().
14383 That way, if any of the sv_dup_inc()s dislodge *r from the L1
14384 cache, it doesn't matter. */
14385 const bool anchored = r->check_substr
14386 ? r->check_substr == r->anchored_substr
14387 : r->check_utf8 == r->anchored_utf8;
14388 Newx(ret->substrs, 1, struct reg_substr_data);
14389 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
14391 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
14392 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
14393 ret->float_substr = sv_dup_inc(ret->float_substr, param);
14394 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
14396 /* check_substr and check_utf8, if non-NULL, point to either their
14397 anchored or float namesakes, and don't hold a second reference. */
14399 if (ret->check_substr) {
14401 assert(r->check_utf8 == r->anchored_utf8);
14402 ret->check_substr = ret->anchored_substr;
14403 ret->check_utf8 = ret->anchored_utf8;
14405 assert(r->check_substr == r->float_substr);
14406 assert(r->check_utf8 == r->float_utf8);
14407 ret->check_substr = ret->float_substr;
14408 ret->check_utf8 = ret->float_utf8;
14410 } else if (ret->check_utf8) {
14412 ret->check_utf8 = ret->anchored_utf8;
14414 ret->check_utf8 = ret->float_utf8;
14419 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
14420 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
14423 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
14425 if (RX_MATCH_COPIED(dstr))
14426 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
14428 ret->subbeg = NULL;
14429 #ifdef PERL_OLD_COPY_ON_WRITE
14430 ret->saved_copy = NULL;
14433 if (ret->mother_re) {
14434 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
14435 /* Our storage points directly to our mother regexp, but that's
14436 1: a buffer in a different thread
14437 2: something we no longer hold a reference on
14438 so we need to copy it locally. */
14439 /* Note we need to use SvCUR(), rather than
14440 SvLEN(), on our mother_re, because it, in
14441 turn, may well be pointing to its own mother_re. */
14442 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
14443 SvCUR(ret->mother_re)+1));
14444 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
14446 ret->mother_re = NULL;
14450 #endif /* PERL_IN_XSUB_RE */
14455 This is the internal complement to regdupe() which is used to copy
14456 the structure pointed to by the *pprivate pointer in the regexp.
14457 This is the core version of the extension overridable cloning hook.
14458 The regexp structure being duplicated will be copied by perl prior
14459 to this and will be provided as the regexp *r argument, however
14460 with the /old/ structures pprivate pointer value. Thus this routine
14461 may override any copying normally done by perl.
14463 It returns a pointer to the new regexp_internal structure.
14467 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
14470 struct regexp *const r = (struct regexp *)SvANY(rx);
14471 regexp_internal *reti;
14473 RXi_GET_DECL(r,ri);
14475 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
14479 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
14480 Copy(ri->program, reti->program, len+1, regnode);
14482 reti->num_code_blocks = ri->num_code_blocks;
14483 if (ri->code_blocks) {
14485 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
14486 struct reg_code_block);
14487 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
14488 struct reg_code_block);
14489 for (n = 0; n < ri->num_code_blocks; n++)
14490 reti->code_blocks[n].src_regex = (REGEXP*)
14491 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
14494 reti->code_blocks = NULL;
14496 reti->regstclass = NULL;
14499 struct reg_data *d;
14500 const int count = ri->data->count;
14503 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
14504 char, struct reg_data);
14505 Newx(d->what, count, U8);
14508 for (i = 0; i < count; i++) {
14509 d->what[i] = ri->data->what[i];
14510 switch (d->what[i]) {
14511 /* see also regcomp.h and regfree_internal() */
14512 case 'a': /* actually an AV, but the dup function is identical. */
14516 case 'u': /* actually an HV, but the dup function is identical. */
14517 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
14520 /* This is cheating. */
14521 Newx(d->data[i], 1, struct regnode_charclass_class);
14522 StructCopy(ri->data->data[i], d->data[i],
14523 struct regnode_charclass_class);
14524 reti->regstclass = (regnode*)d->data[i];
14527 /* Trie stclasses are readonly and can thus be shared
14528 * without duplication. We free the stclass in pregfree
14529 * when the corresponding reg_ac_data struct is freed.
14531 reti->regstclass= ri->regstclass;
14535 ((reg_trie_data*)ri->data->data[i])->refcount++;
14540 d->data[i] = ri->data->data[i];
14543 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
14552 reti->name_list_idx = ri->name_list_idx;
14554 #ifdef RE_TRACK_PATTERN_OFFSETS
14555 if (ri->u.offsets) {
14556 Newx(reti->u.offsets, 2*len+1, U32);
14557 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
14560 SetProgLen(reti,len);
14563 return (void*)reti;
14566 #endif /* USE_ITHREADS */
14568 #ifndef PERL_IN_XSUB_RE
14571 - regnext - dig the "next" pointer out of a node
14574 Perl_regnext(pTHX_ register regnode *p)
14582 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
14583 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
14586 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
14595 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
14598 STRLEN l1 = strlen(pat1);
14599 STRLEN l2 = strlen(pat2);
14602 const char *message;
14604 PERL_ARGS_ASSERT_RE_CROAK2;
14610 Copy(pat1, buf, l1 , char);
14611 Copy(pat2, buf + l1, l2 , char);
14612 buf[l1 + l2] = '\n';
14613 buf[l1 + l2 + 1] = '\0';
14615 /* ANSI variant takes additional second argument */
14616 va_start(args, pat2);
14620 msv = vmess(buf, &args);
14622 message = SvPV_const(msv,l1);
14625 Copy(message, buf, l1 , char);
14626 buf[l1-1] = '\0'; /* Overwrite \n */
14627 Perl_croak(aTHX_ "%s", buf);
14630 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
14632 #ifndef PERL_IN_XSUB_RE
14634 Perl_save_re_context(pTHX)
14638 struct re_save_state *state;
14640 SAVEVPTR(PL_curcop);
14641 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
14643 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
14644 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
14645 SSPUSHUV(SAVEt_RE_STATE);
14647 Copy(&PL_reg_state, state, 1, struct re_save_state);
14649 PL_reg_oldsaved = NULL;
14650 PL_reg_oldsavedlen = 0;
14651 PL_reg_oldsavedoffset = 0;
14652 PL_reg_oldsavedcoffset = 0;
14653 PL_reg_maxiter = 0;
14654 PL_reg_leftiter = 0;
14655 PL_reg_poscache = NULL;
14656 PL_reg_poscache_size = 0;
14657 #ifdef PERL_OLD_COPY_ON_WRITE
14661 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
14663 const REGEXP * const rx = PM_GETRE(PL_curpm);
14666 for (i = 1; i <= RX_NPARENS(rx); i++) {
14667 char digits[TYPE_CHARS(long)];
14668 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
14669 GV *const *const gvp
14670 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
14673 GV * const gv = *gvp;
14674 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
14684 clear_re(pTHX_ void *r)
14687 ReREFCNT_dec((REGEXP *)r);
14693 S_put_byte(pTHX_ SV *sv, int c)
14695 PERL_ARGS_ASSERT_PUT_BYTE;
14697 /* Our definition of isPRINT() ignores locales, so only bytes that are
14698 not part of UTF-8 are considered printable. I assume that the same
14699 holds for UTF-EBCDIC.
14700 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
14701 which Wikipedia says:
14703 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
14704 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
14705 identical, to the ASCII delete (DEL) or rubout control character.
14706 ) So the old condition can be simplified to !isPRINT(c) */
14709 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
14712 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
14716 const char string = c;
14717 if (c == '-' || c == ']' || c == '\\' || c == '^')
14718 sv_catpvs(sv, "\\");
14719 sv_catpvn(sv, &string, 1);
14724 #define CLEAR_OPTSTART \
14725 if (optstart) STMT_START { \
14726 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
14730 #define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
14732 STATIC const regnode *
14733 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
14734 const regnode *last, const regnode *plast,
14735 SV* sv, I32 indent, U32 depth)
14738 U8 op = PSEUDO; /* Arbitrary non-END op. */
14739 const regnode *next;
14740 const regnode *optstart= NULL;
14742 RXi_GET_DECL(r,ri);
14743 GET_RE_DEBUG_FLAGS_DECL;
14745 PERL_ARGS_ASSERT_DUMPUNTIL;
14747 #ifdef DEBUG_DUMPUNTIL
14748 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
14749 last ? last-start : 0,plast ? plast-start : 0);
14752 if (plast && plast < last)
14755 while (PL_regkind[op] != END && (!last || node < last)) {
14756 /* While that wasn't END last time... */
14759 if (op == CLOSE || op == WHILEM)
14761 next = regnext((regnode *)node);
14764 if (OP(node) == OPTIMIZED) {
14765 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
14772 regprop(r, sv, node);
14773 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
14774 (int)(2*indent + 1), "", SvPVX_const(sv));
14776 if (OP(node) != OPTIMIZED) {
14777 if (next == NULL) /* Next ptr. */
14778 PerlIO_printf(Perl_debug_log, " (0)");
14779 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
14780 PerlIO_printf(Perl_debug_log, " (FAIL)");
14782 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
14783 (void)PerlIO_putc(Perl_debug_log, '\n');
14787 if (PL_regkind[(U8)op] == BRANCHJ) {
14790 const regnode *nnode = (OP(next) == LONGJMP
14791 ? regnext((regnode *)next)
14793 if (last && nnode > last)
14795 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
14798 else if (PL_regkind[(U8)op] == BRANCH) {
14800 DUMPUNTIL(NEXTOPER(node), next);
14802 else if ( PL_regkind[(U8)op] == TRIE ) {
14803 const regnode *this_trie = node;
14804 const char op = OP(node);
14805 const U32 n = ARG(node);
14806 const reg_ac_data * const ac = op>=AHOCORASICK ?
14807 (reg_ac_data *)ri->data->data[n] :
14809 const reg_trie_data * const trie =
14810 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
14812 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
14814 const regnode *nextbranch= NULL;
14817 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
14818 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
14820 PerlIO_printf(Perl_debug_log, "%*s%s ",
14821 (int)(2*(indent+3)), "",
14822 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
14823 PL_colors[0], PL_colors[1],
14824 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
14825 PERL_PV_PRETTY_ELLIPSES |
14826 PERL_PV_PRETTY_LTGT
14831 U16 dist= trie->jump[word_idx+1];
14832 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
14833 (UV)((dist ? this_trie + dist : next) - start));
14836 nextbranch= this_trie + trie->jump[0];
14837 DUMPUNTIL(this_trie + dist, nextbranch);
14839 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
14840 nextbranch= regnext((regnode *)nextbranch);
14842 PerlIO_printf(Perl_debug_log, "\n");
14845 if (last && next > last)
14850 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
14851 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
14852 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
14854 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
14856 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
14858 else if ( op == PLUS || op == STAR) {
14859 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
14861 else if (PL_regkind[(U8)op] == ANYOF) {
14862 /* arglen 1 + class block */
14863 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
14864 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
14865 node = NEXTOPER(node);
14867 else if (PL_regkind[(U8)op] == EXACT) {
14868 /* Literal string, where present. */
14869 node += NODE_SZ_STR(node) - 1;
14870 node = NEXTOPER(node);
14873 node = NEXTOPER(node);
14874 node += regarglen[(U8)op];
14876 if (op == CURLYX || op == OPEN)
14880 #ifdef DEBUG_DUMPUNTIL
14881 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
14886 #endif /* DEBUGGING */
14890 * c-indentation-style: bsd
14891 * c-basic-offset: 4
14892 * indent-tabs-mode: nil
14895 * ex: set ts=8 sts=4 sw=4 et: