5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
97 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
100 #define STATIC static
104 struct RExC_state_t {
105 U32 flags; /* RXf_* are we folding, multilining? */
106 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
107 char *precomp; /* uncompiled string. */
108 REGEXP *rx_sv; /* The SV that is the regexp. */
109 regexp *rx; /* perl core regexp structure */
110 regexp_internal *rxi; /* internal data for regexp object
112 char *start; /* Start of input for compile */
113 char *end; /* End of input for compile */
114 char *parse; /* Input-scan pointer. */
115 SSize_t whilem_seen; /* number of WHILEM in this expr */
116 regnode *emit_start; /* Start of emitted-code area */
117 regnode *emit_bound; /* First regnode outside of the
119 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
120 implies compiling, so don't emit */
121 regnode_ssc emit_dummy; /* placeholder for emit to point to;
122 large enough for the largest
123 non-EXACTish node, so can use it as
125 I32 naughty; /* How bad is this pattern? */
126 I32 sawback; /* Did we see \1, ...? */
128 SSize_t size; /* Code size. */
129 I32 npar; /* Capture buffer count, (OPEN) plus
130 one. ("par" 0 is the whole
132 I32 nestroot; /* root parens we are in - used by
136 regnode **open_parens; /* pointers to open parens */
137 regnode **close_parens; /* pointers to close parens */
138 regnode *opend; /* END node in program */
139 I32 utf8; /* whether the pattern is utf8 or not */
140 I32 orig_utf8; /* whether the pattern was originally in utf8 */
141 /* XXX use this for future optimisation of case
142 * where pattern must be upgraded to utf8. */
143 I32 uni_semantics; /* If a d charset modifier should use unicode
144 rules, even if the pattern is not in
146 HV *paren_names; /* Paren names */
148 regnode **recurse; /* Recurse regops */
149 I32 recurse_count; /* Number of recurse regops */
150 U8 *study_chunk_recursed; /* bitmap of which parens we have moved
152 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
156 I32 override_recoding;
157 I32 in_multi_char_class;
158 struct reg_code_block *code_blocks; /* positions of literal (?{})
160 int num_code_blocks; /* size of code_blocks[] */
161 int code_index; /* next code_blocks[] slot */
162 SSize_t maxlen; /* mininum possible number of chars in string to match */
163 #ifdef ADD_TO_REGEXEC
164 char *starttry; /* -Dr: where regtry was called. */
165 #define RExC_starttry (pRExC_state->starttry)
167 SV *runtime_code_qr; /* qr with the runtime code blocks */
169 const char *lastparse;
171 AV *paren_name_list; /* idx -> name */
172 #define RExC_lastparse (pRExC_state->lastparse)
173 #define RExC_lastnum (pRExC_state->lastnum)
174 #define RExC_paren_name_list (pRExC_state->paren_name_list)
178 #define RExC_flags (pRExC_state->flags)
179 #define RExC_pm_flags (pRExC_state->pm_flags)
180 #define RExC_precomp (pRExC_state->precomp)
181 #define RExC_rx_sv (pRExC_state->rx_sv)
182 #define RExC_rx (pRExC_state->rx)
183 #define RExC_rxi (pRExC_state->rxi)
184 #define RExC_start (pRExC_state->start)
185 #define RExC_end (pRExC_state->end)
186 #define RExC_parse (pRExC_state->parse)
187 #define RExC_whilem_seen (pRExC_state->whilem_seen)
188 #ifdef RE_TRACK_PATTERN_OFFSETS
189 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
192 #define RExC_emit (pRExC_state->emit)
193 #define RExC_emit_dummy (pRExC_state->emit_dummy)
194 #define RExC_emit_start (pRExC_state->emit_start)
195 #define RExC_emit_bound (pRExC_state->emit_bound)
196 #define RExC_naughty (pRExC_state->naughty)
197 #define RExC_sawback (pRExC_state->sawback)
198 #define RExC_seen (pRExC_state->seen)
199 #define RExC_size (pRExC_state->size)
200 #define RExC_maxlen (pRExC_state->maxlen)
201 #define RExC_npar (pRExC_state->npar)
202 #define RExC_nestroot (pRExC_state->nestroot)
203 #define RExC_extralen (pRExC_state->extralen)
204 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
205 #define RExC_utf8 (pRExC_state->utf8)
206 #define RExC_uni_semantics (pRExC_state->uni_semantics)
207 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
208 #define RExC_open_parens (pRExC_state->open_parens)
209 #define RExC_close_parens (pRExC_state->close_parens)
210 #define RExC_opend (pRExC_state->opend)
211 #define RExC_paren_names (pRExC_state->paren_names)
212 #define RExC_recurse (pRExC_state->recurse)
213 #define RExC_recurse_count (pRExC_state->recurse_count)
214 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
215 #define RExC_study_chunk_recursed_bytes \
216 (pRExC_state->study_chunk_recursed_bytes)
217 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
218 #define RExC_contains_locale (pRExC_state->contains_locale)
219 #define RExC_contains_i (pRExC_state->contains_i)
220 #define RExC_override_recoding (pRExC_state->override_recoding)
221 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
224 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
225 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
226 ((*s) == '{' && regcurly(s, FALSE)))
229 * Flags to be passed up and down.
231 #define WORST 0 /* Worst case. */
232 #define HASWIDTH 0x01 /* Known to match non-null strings. */
234 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
235 * character. (There needs to be a case: in the switch statement in regexec.c
236 * for any node marked SIMPLE.) Note that this is not the same thing as
239 #define SPSTART 0x04 /* Starts with * or + */
240 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
241 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
242 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
244 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
246 /* whether trie related optimizations are enabled */
247 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
248 #define TRIE_STUDY_OPT
249 #define FULL_TRIE_STUDY
255 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
256 #define PBITVAL(paren) (1 << ((paren) & 7))
257 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
258 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
259 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
261 #define REQUIRE_UTF8 STMT_START { \
263 *flagp = RESTART_UTF8; \
268 /* This converts the named class defined in regcomp.h to its equivalent class
269 * number defined in handy.h. */
270 #define namedclass_to_classnum(class) ((int) ((class) / 2))
271 #define classnum_to_namedclass(classnum) ((classnum) * 2)
273 #define _invlist_union_complement_2nd(a, b, output) \
274 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
275 #define _invlist_intersection_complement_2nd(a, b, output) \
276 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
278 /* About scan_data_t.
280 During optimisation we recurse through the regexp program performing
281 various inplace (keyhole style) optimisations. In addition study_chunk
282 and scan_commit populate this data structure with information about
283 what strings MUST appear in the pattern. We look for the longest
284 string that must appear at a fixed location, and we look for the
285 longest string that may appear at a floating location. So for instance
290 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
291 strings (because they follow a .* construct). study_chunk will identify
292 both FOO and BAR as being the longest fixed and floating strings respectively.
294 The strings can be composites, for instance
298 will result in a composite fixed substring 'foo'.
300 For each string some basic information is maintained:
302 - offset or min_offset
303 This is the position the string must appear at, or not before.
304 It also implicitly (when combined with minlenp) tells us how many
305 characters must match before the string we are searching for.
306 Likewise when combined with minlenp and the length of the string it
307 tells us how many characters must appear after the string we have
311 Only used for floating strings. This is the rightmost point that
312 the string can appear at. If set to SSize_t_MAX it indicates that the
313 string can occur infinitely far to the right.
316 A pointer to the minimum number of characters of the pattern that the
317 string was found inside. This is important as in the case of positive
318 lookahead or positive lookbehind we can have multiple patterns
323 The minimum length of the pattern overall is 3, the minimum length
324 of the lookahead part is 3, but the minimum length of the part that
325 will actually match is 1. So 'FOO's minimum length is 3, but the
326 minimum length for the F is 1. This is important as the minimum length
327 is used to determine offsets in front of and behind the string being
328 looked for. Since strings can be composites this is the length of the
329 pattern at the time it was committed with a scan_commit. Note that
330 the length is calculated by study_chunk, so that the minimum lengths
331 are not known until the full pattern has been compiled, thus the
332 pointer to the value.
336 In the case of lookbehind the string being searched for can be
337 offset past the start point of the final matching string.
338 If this value was just blithely removed from the min_offset it would
339 invalidate some of the calculations for how many chars must match
340 before or after (as they are derived from min_offset and minlen and
341 the length of the string being searched for).
342 When the final pattern is compiled and the data is moved from the
343 scan_data_t structure into the regexp structure the information
344 about lookbehind is factored in, with the information that would
345 have been lost precalculated in the end_shift field for the
348 The fields pos_min and pos_delta are used to store the minimum offset
349 and the delta to the maximum offset at the current point in the pattern.
353 typedef struct scan_data_t {
354 /*I32 len_min; unused */
355 /*I32 len_delta; unused */
359 SSize_t last_end; /* min value, <0 unless valid. */
360 SSize_t last_start_min;
361 SSize_t last_start_max;
362 SV **longest; /* Either &l_fixed, or &l_float. */
363 SV *longest_fixed; /* longest fixed string found in pattern */
364 SSize_t offset_fixed; /* offset where it starts */
365 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
366 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
367 SV *longest_float; /* longest floating string found in pattern */
368 SSize_t offset_float_min; /* earliest point in string it can appear */
369 SSize_t offset_float_max; /* latest point in string it can appear */
370 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
371 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
374 SSize_t *last_closep;
375 regnode_ssc *start_class;
378 /* The below is perhaps overboard, but this allows us to save a test at the
379 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
380 * and 'a' differ by a single bit; the same with the upper and lower case of
381 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
382 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
383 * then inverts it to form a mask, with just a single 0, in the bit position
384 * where the upper- and lowercase differ. XXX There are about 40 other
385 * instances in the Perl core where this micro-optimization could be used.
386 * Should decide if maintenance cost is worse, before changing those
388 * Returns a boolean as to whether or not 'v' is either a lowercase or
389 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
390 * compile-time constant, the generated code is better than some optimizing
391 * compilers figure out, amounting to a mask and test. The results are
392 * meaningless if 'c' is not one of [A-Za-z] */
393 #define isARG2_lower_or_UPPER_ARG1(c, v) \
394 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
397 * Forward declarations for pregcomp()'s friends.
400 static const scan_data_t zero_scan_data =
401 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
403 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
404 #define SF_BEFORE_SEOL 0x0001
405 #define SF_BEFORE_MEOL 0x0002
406 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
407 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
409 #define SF_FIX_SHIFT_EOL (+2)
410 #define SF_FL_SHIFT_EOL (+4)
412 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
413 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
415 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
416 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
417 #define SF_IS_INF 0x0040
418 #define SF_HAS_PAR 0x0080
419 #define SF_IN_PAR 0x0100
420 #define SF_HAS_EVAL 0x0200
421 #define SCF_DO_SUBSTR 0x0400
422 #define SCF_DO_STCLASS_AND 0x0800
423 #define SCF_DO_STCLASS_OR 0x1000
424 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
425 #define SCF_WHILEM_VISITED_POS 0x2000
427 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
428 #define SCF_SEEN_ACCEPT 0x8000
429 #define SCF_TRIE_DOING_RESTUDY 0x10000
431 #define UTF cBOOL(RExC_utf8)
433 /* The enums for all these are ordered so things work out correctly */
434 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
435 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
436 == REGEX_DEPENDS_CHARSET)
437 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
438 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
439 >= REGEX_UNICODE_CHARSET)
440 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
441 == REGEX_ASCII_RESTRICTED_CHARSET)
442 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
443 >= REGEX_ASCII_RESTRICTED_CHARSET)
444 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
445 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
447 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
449 /* For programs that want to be strictly Unicode compatible by dying if any
450 * attempt is made to match a non-Unicode code point against a Unicode
452 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
454 #define OOB_NAMEDCLASS -1
456 /* There is no code point that is out-of-bounds, so this is problematic. But
457 * its only current use is to initialize a variable that is always set before
459 #define OOB_UNICODE 0xDEADBEEF
461 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
462 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
465 /* length of regex to show in messages that don't mark a position within */
466 #define RegexLengthToShowInErrorMessages 127
469 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
470 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
471 * op/pragma/warn/regcomp.
473 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
474 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
476 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
477 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
479 #define REPORT_LOCATION_ARGS(offset) \
480 UTF8fARG(UTF, offset, RExC_precomp), \
481 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
484 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
485 * arg. Show regex, up to a maximum length. If it's too long, chop and add
488 #define _FAIL(code) STMT_START { \
489 const char *ellipses = ""; \
490 IV len = RExC_end - RExC_precomp; \
493 SAVEFREESV(RExC_rx_sv); \
494 if (len > RegexLengthToShowInErrorMessages) { \
495 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
496 len = RegexLengthToShowInErrorMessages - 10; \
502 #define FAIL(msg) _FAIL( \
503 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
504 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
506 #define FAIL2(msg,arg) _FAIL( \
507 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
508 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
511 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
513 #define Simple_vFAIL(m) STMT_START { \
514 const IV offset = RExC_parse - RExC_precomp; \
515 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
516 m, REPORT_LOCATION_ARGS(offset)); \
520 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
522 #define vFAIL(m) STMT_START { \
524 SAVEFREESV(RExC_rx_sv); \
529 * Like Simple_vFAIL(), but accepts two arguments.
531 #define Simple_vFAIL2(m,a1) STMT_START { \
532 const IV offset = RExC_parse - RExC_precomp; \
533 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
534 REPORT_LOCATION_ARGS(offset)); \
538 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
540 #define vFAIL2(m,a1) STMT_START { \
542 SAVEFREESV(RExC_rx_sv); \
543 Simple_vFAIL2(m, a1); \
548 * Like Simple_vFAIL(), but accepts three arguments.
550 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
551 const IV offset = RExC_parse - RExC_precomp; \
552 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
553 REPORT_LOCATION_ARGS(offset)); \
557 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
559 #define vFAIL3(m,a1,a2) STMT_START { \
561 SAVEFREESV(RExC_rx_sv); \
562 Simple_vFAIL3(m, a1, a2); \
566 * Like Simple_vFAIL(), but accepts four arguments.
568 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
569 const IV offset = RExC_parse - RExC_precomp; \
570 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
571 REPORT_LOCATION_ARGS(offset)); \
574 #define vFAIL4(m,a1,a2,a3) STMT_START { \
576 SAVEFREESV(RExC_rx_sv); \
577 Simple_vFAIL4(m, a1, a2, a3); \
580 /* A specialized version of vFAIL2 that works with UTF8f */
581 #define vFAIL2utf8f(m, a1) STMT_START { \
582 const IV offset = RExC_parse - RExC_precomp; \
584 SAVEFREESV(RExC_rx_sv); \
585 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
586 REPORT_LOCATION_ARGS(offset)); \
590 /* m is not necessarily a "literal string", in this macro */
591 #define reg_warn_non_literal_string(loc, m) STMT_START { \
592 const IV offset = loc - RExC_precomp; \
593 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
594 m, REPORT_LOCATION_ARGS(offset)); \
597 #define ckWARNreg(loc,m) STMT_START { \
598 const IV offset = loc - RExC_precomp; \
599 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
600 REPORT_LOCATION_ARGS(offset)); \
603 #define vWARN_dep(loc, m) STMT_START { \
604 const IV offset = loc - RExC_precomp; \
605 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
606 REPORT_LOCATION_ARGS(offset)); \
609 #define ckWARNdep(loc,m) STMT_START { \
610 const IV offset = loc - RExC_precomp; \
611 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
613 REPORT_LOCATION_ARGS(offset)); \
616 #define ckWARNregdep(loc,m) STMT_START { \
617 const IV offset = loc - RExC_precomp; \
618 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
620 REPORT_LOCATION_ARGS(offset)); \
623 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
624 const IV offset = loc - RExC_precomp; \
625 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
627 a1, REPORT_LOCATION_ARGS(offset)); \
630 #define ckWARN2reg(loc, m, a1) STMT_START { \
631 const IV offset = loc - RExC_precomp; \
632 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
633 a1, REPORT_LOCATION_ARGS(offset)); \
636 #define vWARN3(loc, m, a1, a2) STMT_START { \
637 const IV offset = loc - RExC_precomp; \
638 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
639 a1, a2, REPORT_LOCATION_ARGS(offset)); \
642 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
643 const IV offset = loc - RExC_precomp; \
644 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
645 a1, a2, REPORT_LOCATION_ARGS(offset)); \
648 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
649 const IV offset = loc - RExC_precomp; \
650 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
651 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
654 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
655 const IV offset = loc - RExC_precomp; \
656 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
657 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
660 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
661 const IV offset = loc - RExC_precomp; \
662 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
663 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
667 /* Allow for side effects in s */
668 #define REGC(c,s) STMT_START { \
669 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
672 /* Macros for recording node offsets. 20001227 mjd@plover.com
673 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
674 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
675 * Element 0 holds the number n.
676 * Position is 1 indexed.
678 #ifndef RE_TRACK_PATTERN_OFFSETS
679 #define Set_Node_Offset_To_R(node,byte)
680 #define Set_Node_Offset(node,byte)
681 #define Set_Cur_Node_Offset
682 #define Set_Node_Length_To_R(node,len)
683 #define Set_Node_Length(node,len)
684 #define Set_Node_Cur_Length(node,start)
685 #define Node_Offset(n)
686 #define Node_Length(n)
687 #define Set_Node_Offset_Length(node,offset,len)
688 #define ProgLen(ri) ri->u.proglen
689 #define SetProgLen(ri,x) ri->u.proglen = x
691 #define ProgLen(ri) ri->u.offsets[0]
692 #define SetProgLen(ri,x) ri->u.offsets[0] = x
693 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
695 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
696 __LINE__, (int)(node), (int)(byte))); \
698 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
701 RExC_offsets[2*(node)-1] = (byte); \
706 #define Set_Node_Offset(node,byte) \
707 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
708 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
710 #define Set_Node_Length_To_R(node,len) STMT_START { \
712 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
713 __LINE__, (int)(node), (int)(len))); \
715 Perl_croak(aTHX_ "value of node is %d in Length macro", \
718 RExC_offsets[2*(node)] = (len); \
723 #define Set_Node_Length(node,len) \
724 Set_Node_Length_To_R((node)-RExC_emit_start, len)
725 #define Set_Node_Cur_Length(node, start) \
726 Set_Node_Length(node, RExC_parse - start)
728 /* Get offsets and lengths */
729 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
730 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
732 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
733 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
734 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
738 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
739 #define EXPERIMENTAL_INPLACESCAN
740 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
742 #define DEBUG_RExC_seen() \
743 DEBUG_OPTIMISE_MORE_r({ \
744 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
746 if (RExC_seen & REG_ZERO_LEN_SEEN) \
747 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
749 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
750 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
752 if (RExC_seen & REG_GPOS_SEEN) \
753 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
755 if (RExC_seen & REG_CANY_SEEN) \
756 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
758 if (RExC_seen & REG_RECURSE_SEEN) \
759 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
761 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
762 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
764 if (RExC_seen & REG_VERBARG_SEEN) \
765 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
767 if (RExC_seen & REG_CUTGROUP_SEEN) \
768 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
770 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
771 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
773 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
774 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
776 if (RExC_seen & REG_GOSTART_SEEN) \
777 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
779 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
780 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
782 PerlIO_printf(Perl_debug_log,"\n"); \
785 #define DEBUG_STUDYDATA(str,data,depth) \
786 DEBUG_OPTIMISE_MORE_r(if(data){ \
787 PerlIO_printf(Perl_debug_log, \
788 "%*s" str "Pos:%"IVdf"/%"IVdf \
789 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
790 (int)(depth)*2, "", \
791 (IV)((data)->pos_min), \
792 (IV)((data)->pos_delta), \
793 (UV)((data)->flags), \
794 (IV)((data)->whilem_c), \
795 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
796 is_inf ? "INF " : "" \
798 if ((data)->last_found) \
799 PerlIO_printf(Perl_debug_log, \
800 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
801 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
802 SvPVX_const((data)->last_found), \
803 (IV)((data)->last_end), \
804 (IV)((data)->last_start_min), \
805 (IV)((data)->last_start_max), \
806 ((data)->longest && \
807 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
808 SvPVX_const((data)->longest_fixed), \
809 (IV)((data)->offset_fixed), \
810 ((data)->longest && \
811 (data)->longest==&((data)->longest_float)) ? "*" : "", \
812 SvPVX_const((data)->longest_float), \
813 (IV)((data)->offset_float_min), \
814 (IV)((data)->offset_float_max) \
816 PerlIO_printf(Perl_debug_log,"\n"); \
819 /* Mark that we cannot extend a found fixed substring at this point.
820 Update the longest found anchored substring and the longest found
821 floating substrings if needed. */
824 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
825 SSize_t *minlenp, int is_inf)
827 const STRLEN l = CHR_SVLEN(data->last_found);
828 const STRLEN old_l = CHR_SVLEN(*data->longest);
829 GET_RE_DEBUG_FLAGS_DECL;
831 PERL_ARGS_ASSERT_SCAN_COMMIT;
833 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
834 SvSetMagicSV(*data->longest, data->last_found);
835 if (*data->longest == data->longest_fixed) {
836 data->offset_fixed = l ? data->last_start_min : data->pos_min;
837 if (data->flags & SF_BEFORE_EOL)
839 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
841 data->flags &= ~SF_FIX_BEFORE_EOL;
842 data->minlen_fixed=minlenp;
843 data->lookbehind_fixed=0;
845 else { /* *data->longest == data->longest_float */
846 data->offset_float_min = l ? data->last_start_min : data->pos_min;
847 data->offset_float_max = (l
848 ? data->last_start_max
849 : (data->pos_delta == SSize_t_MAX
851 : data->pos_min + data->pos_delta));
853 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
854 data->offset_float_max = SSize_t_MAX;
855 if (data->flags & SF_BEFORE_EOL)
857 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
859 data->flags &= ~SF_FL_BEFORE_EOL;
860 data->minlen_float=minlenp;
861 data->lookbehind_float=0;
864 SvCUR_set(data->last_found, 0);
866 SV * const sv = data->last_found;
867 if (SvUTF8(sv) && SvMAGICAL(sv)) {
868 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
874 data->flags &= ~SF_BEFORE_EOL;
875 DEBUG_STUDYDATA("commit: ",data,0);
878 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
879 * list that describes which code points it matches */
882 S_ssc_anything(pTHX_ regnode_ssc *ssc)
884 /* Set the SSC 'ssc' to match an empty string or any code point */
886 PERL_ARGS_ASSERT_SSC_ANYTHING;
888 assert(is_ANYOF_SYNTHETIC(ssc));
890 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
891 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
892 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
896 S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
898 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
899 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
900 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
901 * in any way, so there's no point in using it */
906 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
908 assert(is_ANYOF_SYNTHETIC(ssc));
910 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
914 /* See if the list consists solely of the range 0 - Infinity */
915 invlist_iterinit(ssc->invlist);
916 ret = invlist_iternext(ssc->invlist, &start, &end)
920 invlist_iterfinish(ssc->invlist);
926 /* If e.g., both \w and \W are set, matches everything */
927 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
929 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
930 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
940 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
942 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
943 * string, any code point, or any posix class under locale */
945 PERL_ARGS_ASSERT_SSC_INIT;
947 Zero(ssc, 1, regnode_ssc);
948 set_ANYOF_SYNTHETIC(ssc);
949 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
952 /* If any portion of the regex is to operate under locale rules,
953 * initialization includes it. The reason this isn't done for all regexes
954 * is that the optimizer was written under the assumption that locale was
955 * all-or-nothing. Given the complexity and lack of documentation in the
956 * optimizer, and that there are inadequate test cases for locale, many
957 * parts of it may not work properly, it is safest to avoid locale unless
959 if (RExC_contains_locale) {
960 ANYOF_POSIXL_SETALL(ssc);
963 ANYOF_POSIXL_ZERO(ssc);
968 S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
969 const regnode_ssc *ssc)
971 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
972 * to the list of code points matched, and locale posix classes; hence does
973 * not check its flags) */
978 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
980 assert(is_ANYOF_SYNTHETIC(ssc));
982 invlist_iterinit(ssc->invlist);
983 ret = invlist_iternext(ssc->invlist, &start, &end)
987 invlist_iterfinish(ssc->invlist);
993 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1001 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1002 const regnode_charclass* const node)
1004 /* Returns a mortal inversion list defining which code points are matched
1005 * by 'node', which is of type ANYOF. Handles complementing the result if
1006 * appropriate. If some code points aren't knowable at this time, the
1007 * returned list must, and will, contain every code point that is a
1010 SV* invlist = sv_2mortal(_new_invlist(0));
1011 SV* only_utf8_locale_invlist = NULL;
1013 const U32 n = ARG(node);
1014 bool new_node_has_latin1 = FALSE;
1016 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1018 /* Look at the data structure created by S_set_ANYOF_arg() */
1019 if (n != ANYOF_NONBITMAP_EMPTY) {
1020 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1021 AV * const av = MUTABLE_AV(SvRV(rv));
1022 SV **const ary = AvARRAY(av);
1023 assert(RExC_rxi->data->what[n] == 's');
1025 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1026 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1028 else if (ary[0] && ary[0] != &PL_sv_undef) {
1030 /* Here, no compile-time swash, and there are things that won't be
1031 * known until runtime -- we have to assume it could be anything */
1032 return _add_range_to_invlist(invlist, 0, UV_MAX);
1034 else if (ary[3] && ary[3] != &PL_sv_undef) {
1036 /* Here no compile-time swash, and no run-time only data. Use the
1037 * node's inversion list */
1038 invlist = sv_2mortal(invlist_clone(ary[3]));
1041 /* Get the code points valid only under UTF-8 locales */
1042 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1043 && ary[2] && ary[2] != &PL_sv_undef)
1045 only_utf8_locale_invlist = ary[2];
1049 /* An ANYOF node contains a bitmap for the first 256 code points, and an
1050 * inversion list for the others, but if there are code points that should
1051 * match only conditionally on the target string being UTF-8, those are
1052 * placed in the inversion list, and not the bitmap. Since there are
1053 * circumstances under which they could match, they are included in the
1054 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
1055 * here, so that when we invert below, the end result actually does include
1056 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
1057 * before we add the unconditionally matched code points */
1058 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1059 _invlist_intersection_complement_2nd(invlist,
1064 /* Add in the points from the bit map */
1065 for (i = 0; i < 256; i++) {
1066 if (ANYOF_BITMAP_TEST(node, i)) {
1067 invlist = add_cp_to_invlist(invlist, i);
1068 new_node_has_latin1 = TRUE;
1072 /* If this can match all upper Latin1 code points, have to add them
1074 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1075 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1078 /* Similarly for these */
1079 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1080 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1083 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1084 _invlist_invert(invlist);
1086 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1088 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1089 * locale. We can skip this if there are no 0-255 at all. */
1090 _invlist_union(invlist, PL_Latin1, &invlist);
1093 /* Similarly add the UTF-8 locale possible matches. These have to be
1094 * deferred until after the non-UTF-8 locale ones are taken care of just
1095 * above, or it leads to wrong results under ANYOF_INVERT */
1096 if (only_utf8_locale_invlist) {
1097 _invlist_union_maybe_complement_2nd(invlist,
1098 only_utf8_locale_invlist,
1099 ANYOF_FLAGS(node) & ANYOF_INVERT,
1106 /* These two functions currently do the exact same thing */
1107 #define ssc_init_zero ssc_init
1109 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1110 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1112 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1113 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
1114 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1117 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1118 const regnode_charclass *and_with)
1120 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1121 * another SSC or a regular ANYOF class. Can create false positives. */
1126 PERL_ARGS_ASSERT_SSC_AND;
1128 assert(is_ANYOF_SYNTHETIC(ssc));
1130 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1131 * the code point inversion list and just the relevant flags */
1132 if (is_ANYOF_SYNTHETIC(and_with)) {
1133 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1134 anded_flags = ANYOF_FLAGS(and_with);
1136 /* XXX This is a kludge around what appears to be deficiencies in the
1137 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1138 * there are paths through the optimizer where it doesn't get weeded
1139 * out when it should. And if we don't make some extra provision for
1140 * it like the code just below, it doesn't get added when it should.
1141 * This solution is to add it only when AND'ing, which is here, and
1142 * only when what is being AND'ed is the pristine, original node
1143 * matching anything. Thus it is like adding it to ssc_anything() but
1144 * only when the result is to be AND'ed. Probably the same solution
1145 * could be adopted for the same problem we have with /l matching,
1146 * which is solved differently in S_ssc_init(), and that would lead to
1147 * fewer false positives than that solution has. But if this solution
1148 * creates bugs, the consequences are only that a warning isn't raised
1149 * that should be; while the consequences for having /l bugs is
1150 * incorrect matches */
1151 if (ssc_is_anything((regnode_ssc *)and_with)) {
1152 anded_flags |= ANYOF_WARN_SUPER;
1156 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1157 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1160 ANYOF_FLAGS(ssc) &= anded_flags;
1162 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1163 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1164 * 'and_with' may be inverted. When not inverted, we have the situation of
1166 * (C1 | P1) & (C2 | P2)
1167 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1168 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1169 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1170 * <= ((C1 & C2) | P1 | P2)
1171 * Alternatively, the last few steps could be:
1172 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1173 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1174 * <= (C1 | C2 | (P1 & P2))
1175 * We favor the second approach if either P1 or P2 is non-empty. This is
1176 * because these components are a barrier to doing optimizations, as what
1177 * they match cannot be known until the moment of matching as they are
1178 * dependent on the current locale, 'AND"ing them likely will reduce or
1180 * But we can do better if we know that C1,P1 are in their initial state (a
1181 * frequent occurrence), each matching everything:
1182 * (<everything>) & (C2 | P2) = C2 | P2
1183 * Similarly, if C2,P2 are in their initial state (again a frequent
1184 * occurrence), the result is a no-op
1185 * (C1 | P1) & (<everything>) = C1 | P1
1188 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1189 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1190 * <= (C1 & ~C2) | (P1 & ~P2)
1193 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1194 && ! is_ANYOF_SYNTHETIC(and_with))
1198 ssc_intersection(ssc,
1200 FALSE /* Has already been inverted */
1203 /* If either P1 or P2 is empty, the intersection will be also; can skip
1205 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1206 ANYOF_POSIXL_ZERO(ssc);
1208 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1210 /* Note that the Posix class component P from 'and_with' actually
1212 * P = Pa | Pb | ... | Pn
1213 * where each component is one posix class, such as in [\w\s].
1215 * ~P = ~(Pa | Pb | ... | Pn)
1216 * = ~Pa & ~Pb & ... & ~Pn
1217 * <= ~Pa | ~Pb | ... | ~Pn
1218 * The last is something we can easily calculate, but unfortunately
1219 * is likely to have many false positives. We could do better
1220 * in some (but certainly not all) instances if two classes in
1221 * P have known relationships. For example
1222 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1224 * :lower: & :print: = :lower:
1225 * And similarly for classes that must be disjoint. For example,
1226 * since \s and \w can have no elements in common based on rules in
1227 * the POSIX standard,
1228 * \w & ^\S = nothing
1229 * Unfortunately, some vendor locales do not meet the Posix
1230 * standard, in particular almost everything by Microsoft.
1231 * The loop below just changes e.g., \w into \W and vice versa */
1233 regnode_charclass_posixl temp;
1234 int add = 1; /* To calculate the index of the complement */
1236 ANYOF_POSIXL_ZERO(&temp);
1237 for (i = 0; i < ANYOF_MAX; i++) {
1239 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1240 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1242 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1243 ANYOF_POSIXL_SET(&temp, i + add);
1245 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1247 ANYOF_POSIXL_AND(&temp, ssc);
1249 } /* else ssc already has no posixes */
1250 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1251 in its initial state */
1252 else if (! is_ANYOF_SYNTHETIC(and_with)
1253 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1255 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1256 * copy it over 'ssc' */
1257 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1258 if (is_ANYOF_SYNTHETIC(and_with)) {
1259 StructCopy(and_with, ssc, regnode_ssc);
1262 ssc->invlist = anded_cp_list;
1263 ANYOF_POSIXL_ZERO(ssc);
1264 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1265 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1269 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1270 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1272 /* One or the other of P1, P2 is non-empty. */
1273 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1274 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1276 ssc_union(ssc, anded_cp_list, FALSE);
1278 else { /* P1 = P2 = empty */
1279 ssc_intersection(ssc, anded_cp_list, FALSE);
1285 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1286 const regnode_charclass *or_with)
1288 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1289 * another SSC or a regular ANYOF class. Can create false positives if
1290 * 'or_with' is to be inverted. */
1295 PERL_ARGS_ASSERT_SSC_OR;
1297 assert(is_ANYOF_SYNTHETIC(ssc));
1299 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1300 * the code point inversion list and just the relevant flags */
1301 if (is_ANYOF_SYNTHETIC(or_with)) {
1302 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1303 ored_flags = ANYOF_FLAGS(or_with);
1306 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1307 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1310 ANYOF_FLAGS(ssc) |= ored_flags;
1312 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1313 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1314 * 'or_with' may be inverted. When not inverted, we have the simple
1315 * situation of computing:
1316 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1317 * If P1|P2 yields a situation with both a class and its complement are
1318 * set, like having both \w and \W, this matches all code points, and we
1319 * can delete these from the P component of the ssc going forward. XXX We
1320 * might be able to delete all the P components, but I (khw) am not certain
1321 * about this, and it is better to be safe.
1324 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1325 * <= (C1 | P1) | ~C2
1326 * <= (C1 | ~C2) | P1
1327 * (which results in actually simpler code than the non-inverted case)
1330 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1331 && ! is_ANYOF_SYNTHETIC(or_with))
1333 /* We ignore P2, leaving P1 going forward */
1334 } /* else Not inverted */
1335 else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1336 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1337 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1339 for (i = 0; i < ANYOF_MAX; i += 2) {
1340 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1342 ssc_match_all_cp(ssc);
1343 ANYOF_POSIXL_CLEAR(ssc, i);
1344 ANYOF_POSIXL_CLEAR(ssc, i+1);
1352 FALSE /* Already has been inverted */
1356 PERL_STATIC_INLINE void
1357 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1359 PERL_ARGS_ASSERT_SSC_UNION;
1361 assert(is_ANYOF_SYNTHETIC(ssc));
1363 _invlist_union_maybe_complement_2nd(ssc->invlist,
1369 PERL_STATIC_INLINE void
1370 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1372 const bool invert2nd)
1374 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1376 assert(is_ANYOF_SYNTHETIC(ssc));
1378 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1384 PERL_STATIC_INLINE void
1385 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1387 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1389 assert(is_ANYOF_SYNTHETIC(ssc));
1391 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1394 PERL_STATIC_INLINE void
1395 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1397 /* AND just the single code point 'cp' into the SSC 'ssc' */
1399 SV* cp_list = _new_invlist(2);
1401 PERL_ARGS_ASSERT_SSC_CP_AND;
1403 assert(is_ANYOF_SYNTHETIC(ssc));
1405 cp_list = add_cp_to_invlist(cp_list, cp);
1406 ssc_intersection(ssc, cp_list,
1407 FALSE /* Not inverted */
1409 SvREFCNT_dec_NN(cp_list);
1412 PERL_STATIC_INLINE void
1413 S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1415 /* Set the SSC 'ssc' to not match any locale things */
1417 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1419 assert(is_ANYOF_SYNTHETIC(ssc));
1421 ANYOF_POSIXL_ZERO(ssc);
1422 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1426 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1428 /* The inversion list in the SSC is marked mortal; now we need a more
1429 * permanent copy, which is stored the same way that is done in a regular
1430 * ANYOF node, with the first 256 code points in a bit map */
1432 SV* invlist = invlist_clone(ssc->invlist);
1434 PERL_ARGS_ASSERT_SSC_FINALIZE;
1436 assert(is_ANYOF_SYNTHETIC(ssc));
1438 /* The code in this file assumes that all but these flags aren't relevant
1439 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1440 * time we reach here */
1441 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1443 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1445 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1446 NULL, NULL, NULL, FALSE);
1448 /* Make sure is clone-safe */
1449 ssc->invlist = NULL;
1451 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1452 ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1455 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1458 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1459 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1460 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1461 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1462 ? (TRIE_LIST_CUR( idx ) - 1) \
1468 dump_trie(trie,widecharmap,revcharmap)
1469 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1470 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1472 These routines dump out a trie in a somewhat readable format.
1473 The _interim_ variants are used for debugging the interim
1474 tables that are used to generate the final compressed
1475 representation which is what dump_trie expects.
1477 Part of the reason for their existence is to provide a form
1478 of documentation as to how the different representations function.
1483 Dumps the final compressed table form of the trie to Perl_debug_log.
1484 Used for debugging make_trie().
1488 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1489 AV *revcharmap, U32 depth)
1492 SV *sv=sv_newmortal();
1493 int colwidth= widecharmap ? 6 : 4;
1495 GET_RE_DEBUG_FLAGS_DECL;
1497 PERL_ARGS_ASSERT_DUMP_TRIE;
1499 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1500 (int)depth * 2 + 2,"",
1501 "Match","Base","Ofs" );
1503 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1504 SV ** const tmp = av_fetch( revcharmap, state, 0);
1506 PerlIO_printf( Perl_debug_log, "%*s",
1508 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1509 PL_colors[0], PL_colors[1],
1510 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1511 PERL_PV_ESCAPE_FIRSTCHAR
1516 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1517 (int)depth * 2 + 2,"");
1519 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1520 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1521 PerlIO_printf( Perl_debug_log, "\n");
1523 for( state = 1 ; state < trie->statecount ; state++ ) {
1524 const U32 base = trie->states[ state ].trans.base;
1526 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1527 (int)depth * 2 + 2,"", (UV)state);
1529 if ( trie->states[ state ].wordnum ) {
1530 PerlIO_printf( Perl_debug_log, " W%4X",
1531 trie->states[ state ].wordnum );
1533 PerlIO_printf( Perl_debug_log, "%6s", "" );
1536 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1541 while( ( base + ofs < trie->uniquecharcount ) ||
1542 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1543 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1547 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1549 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1550 if ( ( base + ofs >= trie->uniquecharcount )
1551 && ( base + ofs - trie->uniquecharcount
1553 && trie->trans[ base + ofs
1554 - trie->uniquecharcount ].check == state )
1556 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1558 (UV)trie->trans[ base + ofs
1559 - trie->uniquecharcount ].next );
1561 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1565 PerlIO_printf( Perl_debug_log, "]");
1568 PerlIO_printf( Perl_debug_log, "\n" );
1570 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1572 for (word=1; word <= trie->wordcount; word++) {
1573 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1574 (int)word, (int)(trie->wordinfo[word].prev),
1575 (int)(trie->wordinfo[word].len));
1577 PerlIO_printf(Perl_debug_log, "\n" );
1580 Dumps a fully constructed but uncompressed trie in list form.
1581 List tries normally only are used for construction when the number of
1582 possible chars (trie->uniquecharcount) is very high.
1583 Used for debugging make_trie().
1586 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1587 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1591 SV *sv=sv_newmortal();
1592 int colwidth= widecharmap ? 6 : 4;
1593 GET_RE_DEBUG_FLAGS_DECL;
1595 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1597 /* print out the table precompression. */
1598 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1599 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1600 "------:-----+-----------------\n" );
1602 for( state=1 ; state < next_alloc ; state ++ ) {
1605 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1606 (int)depth * 2 + 2,"", (UV)state );
1607 if ( ! trie->states[ state ].wordnum ) {
1608 PerlIO_printf( Perl_debug_log, "%5s| ","");
1610 PerlIO_printf( Perl_debug_log, "W%4x| ",
1611 trie->states[ state ].wordnum
1614 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1615 SV ** const tmp = av_fetch( revcharmap,
1616 TRIE_LIST_ITEM(state,charid).forid, 0);
1618 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1620 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1622 PL_colors[0], PL_colors[1],
1623 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1624 | PERL_PV_ESCAPE_FIRSTCHAR
1626 TRIE_LIST_ITEM(state,charid).forid,
1627 (UV)TRIE_LIST_ITEM(state,charid).newstate
1630 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1631 (int)((depth * 2) + 14), "");
1634 PerlIO_printf( Perl_debug_log, "\n");
1639 Dumps a fully constructed but uncompressed trie in table form.
1640 This is the normal DFA style state transition table, with a few
1641 twists to facilitate compression later.
1642 Used for debugging make_trie().
1645 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1646 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1651 SV *sv=sv_newmortal();
1652 int colwidth= widecharmap ? 6 : 4;
1653 GET_RE_DEBUG_FLAGS_DECL;
1655 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1658 print out the table precompression so that we can do a visual check
1659 that they are identical.
1662 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1664 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1665 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1667 PerlIO_printf( Perl_debug_log, "%*s",
1669 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1670 PL_colors[0], PL_colors[1],
1671 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1672 PERL_PV_ESCAPE_FIRSTCHAR
1678 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1680 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1681 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1684 PerlIO_printf( Perl_debug_log, "\n" );
1686 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1688 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1689 (int)depth * 2 + 2,"",
1690 (UV)TRIE_NODENUM( state ) );
1692 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1693 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1695 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1697 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1699 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1700 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1701 (UV)trie->trans[ state ].check );
1703 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1704 (UV)trie->trans[ state ].check,
1705 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1713 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1714 startbranch: the first branch in the whole branch sequence
1715 first : start branch of sequence of branch-exact nodes.
1716 May be the same as startbranch
1717 last : Thing following the last branch.
1718 May be the same as tail.
1719 tail : item following the branch sequence
1720 count : words in the sequence
1721 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1722 depth : indent depth
1724 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1726 A trie is an N'ary tree where the branches are determined by digital
1727 decomposition of the key. IE, at the root node you look up the 1st character and
1728 follow that branch repeat until you find the end of the branches. Nodes can be
1729 marked as "accepting" meaning they represent a complete word. Eg:
1733 would convert into the following structure. Numbers represent states, letters
1734 following numbers represent valid transitions on the letter from that state, if
1735 the number is in square brackets it represents an accepting state, otherwise it
1736 will be in parenthesis.
1738 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1742 (1) +-i->(6)-+-s->[7]
1744 +-s->(3)-+-h->(4)-+-e->[5]
1746 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1748 This shows that when matching against the string 'hers' we will begin at state 1
1749 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1750 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1751 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1752 single traverse. We store a mapping from accepting to state to which word was
1753 matched, and then when we have multiple possibilities we try to complete the
1754 rest of the regex in the order in which they occured in the alternation.
1756 The only prior NFA like behaviour that would be changed by the TRIE support is
1757 the silent ignoring of duplicate alternations which are of the form:
1759 / (DUPE|DUPE) X? (?{ ... }) Y /x
1761 Thus EVAL blocks following a trie may be called a different number of times with
1762 and without the optimisation. With the optimisations dupes will be silently
1763 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1764 the following demonstrates:
1766 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1768 which prints out 'word' three times, but
1770 'words'=~/(word|word|word)(?{ print $1 })S/
1772 which doesnt print it out at all. This is due to other optimisations kicking in.
1774 Example of what happens on a structural level:
1776 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1778 1: CURLYM[1] {1,32767}(18)
1789 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1790 and should turn into:
1792 1: CURLYM[1] {1,32767}(18)
1794 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1802 Cases where tail != last would be like /(?foo|bar)baz/:
1812 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1813 and would end up looking like:
1816 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1823 d = uvchr_to_utf8_flags(d, uv, 0);
1825 is the recommended Unicode-aware way of saying
1830 #define TRIE_STORE_REVCHAR(val) \
1833 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1834 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1835 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1836 SvCUR_set(zlopp, kapow - flrbbbbb); \
1839 av_push(revcharmap, zlopp); \
1841 char ooooff = (char)val; \
1842 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1846 /* This gets the next character from the input, folding it if not already
1848 #define TRIE_READ_CHAR STMT_START { \
1851 /* if it is UTF then it is either already folded, or does not need \
1853 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
1855 else if (folder == PL_fold_latin1) { \
1856 /* This folder implies Unicode rules, which in the range expressible \
1857 * by not UTF is the lower case, with the two exceptions, one of \
1858 * which should have been taken care of before calling this */ \
1859 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1860 uvc = toLOWER_L1(*uc); \
1861 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1864 /* raw data, will be folded later if needed */ \
1872 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1873 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1874 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1875 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1877 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1878 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1879 TRIE_LIST_CUR( state )++; \
1882 #define TRIE_LIST_NEW(state) STMT_START { \
1883 Newxz( trie->states[ state ].trans.list, \
1884 4, reg_trie_trans_le ); \
1885 TRIE_LIST_CUR( state ) = 1; \
1886 TRIE_LIST_LEN( state ) = 4; \
1889 #define TRIE_HANDLE_WORD(state) STMT_START { \
1890 U16 dupe= trie->states[ state ].wordnum; \
1891 regnode * const noper_next = regnext( noper ); \
1894 /* store the word for dumping */ \
1896 if (OP(noper) != NOTHING) \
1897 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1899 tmp = newSVpvn_utf8( "", 0, UTF ); \
1900 av_push( trie_words, tmp ); \
1904 trie->wordinfo[curword].prev = 0; \
1905 trie->wordinfo[curword].len = wordlen; \
1906 trie->wordinfo[curword].accept = state; \
1908 if ( noper_next < tail ) { \
1910 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1912 trie->jump[curword] = (U16)(noper_next - convert); \
1914 jumper = noper_next; \
1916 nextbranch= regnext(cur); \
1920 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1921 /* chain, so that when the bits of chain are later */\
1922 /* linked together, the dups appear in the chain */\
1923 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1924 trie->wordinfo[dupe].prev = curword; \
1926 /* we haven't inserted this word yet. */ \
1927 trie->states[ state ].wordnum = curword; \
1932 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1933 ( ( base + charid >= ucharcount \
1934 && base + charid < ubound \
1935 && state == trie->trans[ base - ucharcount + charid ].check \
1936 && trie->trans[ base - ucharcount + charid ].next ) \
1937 ? trie->trans[ base - ucharcount + charid ].next \
1938 : ( state==1 ? special : 0 ) \
1942 #define MADE_JUMP_TRIE 2
1943 #define MADE_EXACT_TRIE 4
1946 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1947 regnode *first, regnode *last, regnode *tail,
1948 U32 word_count, U32 flags, U32 depth)
1951 /* first pass, loop through and scan words */
1952 reg_trie_data *trie;
1953 HV *widecharmap = NULL;
1954 AV *revcharmap = newAV();
1960 regnode *jumper = NULL;
1961 regnode *nextbranch = NULL;
1962 regnode *convert = NULL;
1963 U32 *prev_states; /* temp array mapping each state to previous one */
1964 /* we just use folder as a flag in utf8 */
1965 const U8 * folder = NULL;
1968 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1969 AV *trie_words = NULL;
1970 /* along with revcharmap, this only used during construction but both are
1971 * useful during debugging so we store them in the struct when debugging.
1974 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1975 STRLEN trie_charcount=0;
1977 SV *re_trie_maxbuff;
1978 GET_RE_DEBUG_FLAGS_DECL;
1980 PERL_ARGS_ASSERT_MAKE_TRIE;
1982 PERL_UNUSED_ARG(depth);
1989 case EXACTFU: folder = PL_fold_latin1; break;
1990 case EXACTF: folder = PL_fold; break;
1991 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1994 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1996 trie->startstate = 1;
1997 trie->wordcount = word_count;
1998 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1999 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2001 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2002 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2003 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2006 trie_words = newAV();
2009 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2010 if (!SvIOK(re_trie_maxbuff)) {
2011 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2013 DEBUG_TRIE_COMPILE_r({
2014 PerlIO_printf( Perl_debug_log,
2015 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2016 (int)depth * 2 + 2, "",
2017 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2018 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2021 /* Find the node we are going to overwrite */
2022 if ( first == startbranch && OP( last ) != BRANCH ) {
2023 /* whole branch chain */
2026 /* branch sub-chain */
2027 convert = NEXTOPER( first );
2030 /* -- First loop and Setup --
2032 We first traverse the branches and scan each word to determine if it
2033 contains widechars, and how many unique chars there are, this is
2034 important as we have to build a table with at least as many columns as we
2037 We use an array of integers to represent the character codes 0..255
2038 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2039 the native representation of the character value as the key and IV's for
2042 *TODO* If we keep track of how many times each character is used we can
2043 remap the columns so that the table compression later on is more
2044 efficient in terms of memory by ensuring the most common value is in the
2045 middle and the least common are on the outside. IMO this would be better
2046 than a most to least common mapping as theres a decent chance the most
2047 common letter will share a node with the least common, meaning the node
2048 will not be compressible. With a middle is most common approach the worst
2049 case is when we have the least common nodes twice.
2053 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2054 regnode *noper = NEXTOPER( cur );
2055 const U8 *uc = (U8*)STRING( noper );
2056 const U8 *e = uc + STR_LEN( noper );
2058 U32 wordlen = 0; /* required init */
2059 STRLEN minchars = 0;
2060 STRLEN maxchars = 0;
2061 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2064 if (OP(noper) == NOTHING) {
2065 regnode *noper_next= regnext(noper);
2066 if (noper_next != tail && OP(noper_next) == flags) {
2068 uc= (U8*)STRING(noper);
2069 e= uc + STR_LEN(noper);
2070 trie->minlen= STR_LEN(noper);
2077 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2078 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2079 regardless of encoding */
2080 if (OP( noper ) == EXACTFU_SS) {
2081 /* false positives are ok, so just set this */
2082 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2085 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2087 TRIE_CHARCOUNT(trie)++;
2090 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2091 * is in effect. Under /i, this character can match itself, or
2092 * anything that folds to it. If not under /i, it can match just
2093 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2094 * all fold to k, and all are single characters. But some folds
2095 * expand to more than one character, so for example LATIN SMALL
2096 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2097 * the string beginning at 'uc' is 'ffi', it could be matched by
2098 * three characters, or just by the one ligature character. (It
2099 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2100 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2101 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2102 * match.) The trie needs to know the minimum and maximum number
2103 * of characters that could match so that it can use size alone to
2104 * quickly reject many match attempts. The max is simple: it is
2105 * the number of folded characters in this branch (since a fold is
2106 * never shorter than what folds to it. */
2110 /* And the min is equal to the max if not under /i (indicated by
2111 * 'folder' being NULL), or there are no multi-character folds. If
2112 * there is a multi-character fold, the min is incremented just
2113 * once, for the character that folds to the sequence. Each
2114 * character in the sequence needs to be added to the list below of
2115 * characters in the trie, but we count only the first towards the
2116 * min number of characters needed. This is done through the
2117 * variable 'foldlen', which is returned by the macros that look
2118 * for these sequences as the number of bytes the sequence
2119 * occupies. Each time through the loop, we decrement 'foldlen' by
2120 * how many bytes the current char occupies. Only when it reaches
2121 * 0 do we increment 'minchars' or look for another multi-character
2123 if (folder == NULL) {
2126 else if (foldlen > 0) {
2127 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2132 /* See if *uc is the beginning of a multi-character fold. If
2133 * so, we decrement the length remaining to look at, to account
2134 * for the current character this iteration. (We can use 'uc'
2135 * instead of the fold returned by TRIE_READ_CHAR because for
2136 * non-UTF, the latin1_safe macro is smart enough to account
2137 * for all the unfolded characters, and because for UTF, the
2138 * string will already have been folded earlier in the
2139 * compilation process */
2141 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2142 foldlen -= UTF8SKIP(uc);
2145 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2150 /* The current character (and any potential folds) should be added
2151 * to the possible matching characters for this position in this
2155 U8 folded= folder[ (U8) uvc ];
2156 if ( !trie->charmap[ folded ] ) {
2157 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2158 TRIE_STORE_REVCHAR( folded );
2161 if ( !trie->charmap[ uvc ] ) {
2162 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2163 TRIE_STORE_REVCHAR( uvc );
2166 /* store the codepoint in the bitmap, and its folded
2168 TRIE_BITMAP_SET(trie, uvc);
2170 /* store the folded codepoint */
2171 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2174 /* store first byte of utf8 representation of
2175 variant codepoints */
2176 if (! UVCHR_IS_INVARIANT(uvc)) {
2177 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2180 set_bit = 0; /* We've done our bit :-) */
2184 /* XXX We could come up with the list of code points that fold
2185 * to this using PL_utf8_foldclosures, except not for
2186 * multi-char folds, as there may be multiple combinations
2187 * there that could work, which needs to wait until runtime to
2188 * resolve (The comment about LIGATURE FFI above is such an
2193 widecharmap = newHV();
2195 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2198 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2200 if ( !SvTRUE( *svpp ) ) {
2201 sv_setiv( *svpp, ++trie->uniquecharcount );
2202 TRIE_STORE_REVCHAR(uvc);
2205 } /* end loop through characters in this branch of the trie */
2207 /* We take the min and max for this branch and combine to find the min
2208 * and max for all branches processed so far */
2209 if( cur == first ) {
2210 trie->minlen = minchars;
2211 trie->maxlen = maxchars;
2212 } else if (minchars < trie->minlen) {
2213 trie->minlen = minchars;
2214 } else if (maxchars > trie->maxlen) {
2215 trie->maxlen = maxchars;
2217 } /* end first pass */
2218 DEBUG_TRIE_COMPILE_r(
2219 PerlIO_printf( Perl_debug_log,
2220 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2221 (int)depth * 2 + 2,"",
2222 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2223 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2224 (int)trie->minlen, (int)trie->maxlen )
2228 We now know what we are dealing with in terms of unique chars and
2229 string sizes so we can calculate how much memory a naive
2230 representation using a flat table will take. If it's over a reasonable
2231 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2232 conservative but potentially much slower representation using an array
2235 At the end we convert both representations into the same compressed
2236 form that will be used in regexec.c for matching with. The latter
2237 is a form that cannot be used to construct with but has memory
2238 properties similar to the list form and access properties similar
2239 to the table form making it both suitable for fast searches and
2240 small enough that its feasable to store for the duration of a program.
2242 See the comment in the code where the compressed table is produced
2243 inplace from the flat tabe representation for an explanation of how
2244 the compression works.
2249 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2252 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2253 > SvIV(re_trie_maxbuff) )
2256 Second Pass -- Array Of Lists Representation
2258 Each state will be represented by a list of charid:state records
2259 (reg_trie_trans_le) the first such element holds the CUR and LEN
2260 points of the allocated array. (See defines above).
2262 We build the initial structure using the lists, and then convert
2263 it into the compressed table form which allows faster lookups
2264 (but cant be modified once converted).
2267 STRLEN transcount = 1;
2269 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2270 "%*sCompiling trie using list compiler\n",
2271 (int)depth * 2 + 2, ""));
2273 trie->states = (reg_trie_state *)
2274 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2275 sizeof(reg_trie_state) );
2279 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2281 regnode *noper = NEXTOPER( cur );
2282 U8 *uc = (U8*)STRING( noper );
2283 const U8 *e = uc + STR_LEN( noper );
2284 U32 state = 1; /* required init */
2285 U16 charid = 0; /* sanity init */
2286 U32 wordlen = 0; /* required init */
2288 if (OP(noper) == NOTHING) {
2289 regnode *noper_next= regnext(noper);
2290 if (noper_next != tail && OP(noper_next) == flags) {
2292 uc= (U8*)STRING(noper);
2293 e= uc + STR_LEN(noper);
2297 if (OP(noper) != NOTHING) {
2298 for ( ; uc < e ; uc += len ) {
2303 charid = trie->charmap[ uvc ];
2305 SV** const svpp = hv_fetch( widecharmap,
2312 charid=(U16)SvIV( *svpp );
2315 /* charid is now 0 if we dont know the char read, or
2316 * nonzero if we do */
2323 if ( !trie->states[ state ].trans.list ) {
2324 TRIE_LIST_NEW( state );
2327 check <= TRIE_LIST_USED( state );
2330 if ( TRIE_LIST_ITEM( state, check ).forid
2333 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2338 newstate = next_alloc++;
2339 prev_states[newstate] = state;
2340 TRIE_LIST_PUSH( state, charid, newstate );
2345 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2349 TRIE_HANDLE_WORD(state);
2351 } /* end second pass */
2353 /* next alloc is the NEXT state to be allocated */
2354 trie->statecount = next_alloc;
2355 trie->states = (reg_trie_state *)
2356 PerlMemShared_realloc( trie->states,
2358 * sizeof(reg_trie_state) );
2360 /* and now dump it out before we compress it */
2361 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2362 revcharmap, next_alloc,
2366 trie->trans = (reg_trie_trans *)
2367 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2374 for( state=1 ; state < next_alloc ; state ++ ) {
2378 DEBUG_TRIE_COMPILE_MORE_r(
2379 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2383 if (trie->states[state].trans.list) {
2384 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2388 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2389 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2390 if ( forid < minid ) {
2392 } else if ( forid > maxid ) {
2396 if ( transcount < tp + maxid - minid + 1) {
2398 trie->trans = (reg_trie_trans *)
2399 PerlMemShared_realloc( trie->trans,
2401 * sizeof(reg_trie_trans) );
2402 Zero( trie->trans + (transcount / 2),
2406 base = trie->uniquecharcount + tp - minid;
2407 if ( maxid == minid ) {
2409 for ( ; zp < tp ; zp++ ) {
2410 if ( ! trie->trans[ zp ].next ) {
2411 base = trie->uniquecharcount + zp - minid;
2412 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2414 trie->trans[ zp ].check = state;
2420 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2422 trie->trans[ tp ].check = state;
2427 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2428 const U32 tid = base
2429 - trie->uniquecharcount
2430 + TRIE_LIST_ITEM( state, idx ).forid;
2431 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2433 trie->trans[ tid ].check = state;
2435 tp += ( maxid - minid + 1 );
2437 Safefree(trie->states[ state ].trans.list);
2440 DEBUG_TRIE_COMPILE_MORE_r(
2441 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2444 trie->states[ state ].trans.base=base;
2446 trie->lasttrans = tp + 1;
2450 Second Pass -- Flat Table Representation.
2452 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2453 each. We know that we will need Charcount+1 trans at most to store
2454 the data (one row per char at worst case) So we preallocate both
2455 structures assuming worst case.
2457 We then construct the trie using only the .next slots of the entry
2460 We use the .check field of the first entry of the node temporarily
2461 to make compression both faster and easier by keeping track of how
2462 many non zero fields are in the node.
2464 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2467 There are two terms at use here: state as a TRIE_NODEIDX() which is
2468 a number representing the first entry of the node, and state as a
2469 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2470 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2471 if there are 2 entrys per node. eg:
2479 The table is internally in the right hand, idx form. However as we
2480 also have to deal with the states array which is indexed by nodenum
2481 we have to use TRIE_NODENUM() to convert.
2484 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2485 "%*sCompiling trie using table compiler\n",
2486 (int)depth * 2 + 2, ""));
2488 trie->trans = (reg_trie_trans *)
2489 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2490 * trie->uniquecharcount + 1,
2491 sizeof(reg_trie_trans) );
2492 trie->states = (reg_trie_state *)
2493 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2494 sizeof(reg_trie_state) );
2495 next_alloc = trie->uniquecharcount + 1;
2498 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2500 regnode *noper = NEXTOPER( cur );
2501 const U8 *uc = (U8*)STRING( noper );
2502 const U8 *e = uc + STR_LEN( noper );
2504 U32 state = 1; /* required init */
2506 U16 charid = 0; /* sanity init */
2507 U32 accept_state = 0; /* sanity init */
2509 U32 wordlen = 0; /* required init */
2511 if (OP(noper) == NOTHING) {
2512 regnode *noper_next= regnext(noper);
2513 if (noper_next != tail && OP(noper_next) == flags) {
2515 uc= (U8*)STRING(noper);
2516 e= uc + STR_LEN(noper);
2520 if ( OP(noper) != NOTHING ) {
2521 for ( ; uc < e ; uc += len ) {
2526 charid = trie->charmap[ uvc ];
2528 SV* const * const svpp = hv_fetch( widecharmap,
2532 charid = svpp ? (U16)SvIV(*svpp) : 0;
2536 if ( !trie->trans[ state + charid ].next ) {
2537 trie->trans[ state + charid ].next = next_alloc;
2538 trie->trans[ state ].check++;
2539 prev_states[TRIE_NODENUM(next_alloc)]
2540 = TRIE_NODENUM(state);
2541 next_alloc += trie->uniquecharcount;
2543 state = trie->trans[ state + charid ].next;
2545 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2547 /* charid is now 0 if we dont know the char read, or
2548 * nonzero if we do */
2551 accept_state = TRIE_NODENUM( state );
2552 TRIE_HANDLE_WORD(accept_state);
2554 } /* end second pass */
2556 /* and now dump it out before we compress it */
2557 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2559 next_alloc, depth+1));
2563 * Inplace compress the table.*
2565 For sparse data sets the table constructed by the trie algorithm will
2566 be mostly 0/FAIL transitions or to put it another way mostly empty.
2567 (Note that leaf nodes will not contain any transitions.)
2569 This algorithm compresses the tables by eliminating most such
2570 transitions, at the cost of a modest bit of extra work during lookup:
2572 - Each states[] entry contains a .base field which indicates the
2573 index in the state[] array wheres its transition data is stored.
2575 - If .base is 0 there are no valid transitions from that node.
2577 - If .base is nonzero then charid is added to it to find an entry in
2580 -If trans[states[state].base+charid].check!=state then the
2581 transition is taken to be a 0/Fail transition. Thus if there are fail
2582 transitions at the front of the node then the .base offset will point
2583 somewhere inside the previous nodes data (or maybe even into a node
2584 even earlier), but the .check field determines if the transition is
2588 The following process inplace converts the table to the compressed
2589 table: We first do not compress the root node 1,and mark all its
2590 .check pointers as 1 and set its .base pointer as 1 as well. This
2591 allows us to do a DFA construction from the compressed table later,
2592 and ensures that any .base pointers we calculate later are greater
2595 - We set 'pos' to indicate the first entry of the second node.
2597 - We then iterate over the columns of the node, finding the first and
2598 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2599 and set the .check pointers accordingly, and advance pos
2600 appropriately and repreat for the next node. Note that when we copy
2601 the next pointers we have to convert them from the original
2602 NODEIDX form to NODENUM form as the former is not valid post
2605 - If a node has no transitions used we mark its base as 0 and do not
2606 advance the pos pointer.
2608 - If a node only has one transition we use a second pointer into the
2609 structure to fill in allocated fail transitions from other states.
2610 This pointer is independent of the main pointer and scans forward
2611 looking for null transitions that are allocated to a state. When it
2612 finds one it writes the single transition into the "hole". If the
2613 pointer doesnt find one the single transition is appended as normal.
2615 - Once compressed we can Renew/realloc the structures to release the
2618 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2619 specifically Fig 3.47 and the associated pseudocode.
2623 const U32 laststate = TRIE_NODENUM( next_alloc );
2626 trie->statecount = laststate;
2628 for ( state = 1 ; state < laststate ; state++ ) {
2630 const U32 stateidx = TRIE_NODEIDX( state );
2631 const U32 o_used = trie->trans[ stateidx ].check;
2632 U32 used = trie->trans[ stateidx ].check;
2633 trie->trans[ stateidx ].check = 0;
2636 used && charid < trie->uniquecharcount;
2639 if ( flag || trie->trans[ stateidx + charid ].next ) {
2640 if ( trie->trans[ stateidx + charid ].next ) {
2642 for ( ; zp < pos ; zp++ ) {
2643 if ( ! trie->trans[ zp ].next ) {
2647 trie->states[ state ].trans.base
2649 + trie->uniquecharcount
2651 trie->trans[ zp ].next
2652 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2654 trie->trans[ zp ].check = state;
2655 if ( ++zp > pos ) pos = zp;
2662 trie->states[ state ].trans.base
2663 = pos + trie->uniquecharcount - charid ;
2665 trie->trans[ pos ].next
2666 = SAFE_TRIE_NODENUM(
2667 trie->trans[ stateidx + charid ].next );
2668 trie->trans[ pos ].check = state;
2673 trie->lasttrans = pos + 1;
2674 trie->states = (reg_trie_state *)
2675 PerlMemShared_realloc( trie->states, laststate
2676 * sizeof(reg_trie_state) );
2677 DEBUG_TRIE_COMPILE_MORE_r(
2678 PerlIO_printf( Perl_debug_log,
2679 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2680 (int)depth * 2 + 2,"",
2681 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2685 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2688 } /* end table compress */
2690 DEBUG_TRIE_COMPILE_MORE_r(
2691 PerlIO_printf(Perl_debug_log,
2692 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2693 (int)depth * 2 + 2, "",
2694 (UV)trie->statecount,
2695 (UV)trie->lasttrans)
2697 /* resize the trans array to remove unused space */
2698 trie->trans = (reg_trie_trans *)
2699 PerlMemShared_realloc( trie->trans, trie->lasttrans
2700 * sizeof(reg_trie_trans) );
2702 { /* Modify the program and insert the new TRIE node */
2703 U8 nodetype =(U8)(flags & 0xFF);
2707 regnode *optimize = NULL;
2708 #ifdef RE_TRACK_PATTERN_OFFSETS
2711 U32 mjd_nodelen = 0;
2712 #endif /* RE_TRACK_PATTERN_OFFSETS */
2713 #endif /* DEBUGGING */
2715 This means we convert either the first branch or the first Exact,
2716 depending on whether the thing following (in 'last') is a branch
2717 or not and whther first is the startbranch (ie is it a sub part of
2718 the alternation or is it the whole thing.)
2719 Assuming its a sub part we convert the EXACT otherwise we convert
2720 the whole branch sequence, including the first.
2722 /* Find the node we are going to overwrite */
2723 if ( first != startbranch || OP( last ) == BRANCH ) {
2724 /* branch sub-chain */
2725 NEXT_OFF( first ) = (U16)(last - first);
2726 #ifdef RE_TRACK_PATTERN_OFFSETS
2728 mjd_offset= Node_Offset((convert));
2729 mjd_nodelen= Node_Length((convert));
2732 /* whole branch chain */
2734 #ifdef RE_TRACK_PATTERN_OFFSETS
2737 const regnode *nop = NEXTOPER( convert );
2738 mjd_offset= Node_Offset((nop));
2739 mjd_nodelen= Node_Length((nop));
2743 PerlIO_printf(Perl_debug_log,
2744 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2745 (int)depth * 2 + 2, "",
2746 (UV)mjd_offset, (UV)mjd_nodelen)
2749 /* But first we check to see if there is a common prefix we can
2750 split out as an EXACT and put in front of the TRIE node. */
2751 trie->startstate= 1;
2752 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2754 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2758 const U32 base = trie->states[ state ].trans.base;
2760 if ( trie->states[state].wordnum )
2763 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2764 if ( ( base + ofs >= trie->uniquecharcount ) &&
2765 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2766 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2768 if ( ++count > 1 ) {
2769 SV **tmp = av_fetch( revcharmap, ofs, 0);
2770 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2771 if ( state == 1 ) break;
2773 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2775 PerlIO_printf(Perl_debug_log,
2776 "%*sNew Start State=%"UVuf" Class: [",
2777 (int)depth * 2 + 2, "",
2780 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2781 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2783 TRIE_BITMAP_SET(trie,*ch);
2785 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2787 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2791 TRIE_BITMAP_SET(trie,*ch);
2793 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2794 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2800 SV **tmp = av_fetch( revcharmap, idx, 0);
2802 char *ch = SvPV( *tmp, len );
2804 SV *sv=sv_newmortal();
2805 PerlIO_printf( Perl_debug_log,
2806 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2807 (int)depth * 2 + 2, "",
2809 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2810 PL_colors[0], PL_colors[1],
2811 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2812 PERL_PV_ESCAPE_FIRSTCHAR
2817 OP( convert ) = nodetype;
2818 str=STRING(convert);
2821 STR_LEN(convert) += len;
2827 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2832 trie->prefixlen = (state-1);
2834 regnode *n = convert+NODE_SZ_STR(convert);
2835 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2836 trie->startstate = state;
2837 trie->minlen -= (state - 1);
2838 trie->maxlen -= (state - 1);
2840 /* At least the UNICOS C compiler choked on this
2841 * being argument to DEBUG_r(), so let's just have
2844 #ifdef PERL_EXT_RE_BUILD
2850 regnode *fix = convert;
2851 U32 word = trie->wordcount;
2853 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2854 while( ++fix < n ) {
2855 Set_Node_Offset_Length(fix, 0, 0);
2858 SV ** const tmp = av_fetch( trie_words, word, 0 );
2860 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2861 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2863 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2871 NEXT_OFF(convert) = (U16)(tail - convert);
2872 DEBUG_r(optimize= n);
2878 if ( trie->maxlen ) {
2879 NEXT_OFF( convert ) = (U16)(tail - convert);
2880 ARG_SET( convert, data_slot );
2881 /* Store the offset to the first unabsorbed branch in
2882 jump[0], which is otherwise unused by the jump logic.
2883 We use this when dumping a trie and during optimisation. */
2885 trie->jump[0] = (U16)(nextbranch - convert);
2887 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2888 * and there is a bitmap
2889 * and the first "jump target" node we found leaves enough room
2890 * then convert the TRIE node into a TRIEC node, with the bitmap
2891 * embedded inline in the opcode - this is hypothetically faster.
2893 if ( !trie->states[trie->startstate].wordnum
2895 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2897 OP( convert ) = TRIEC;
2898 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2899 PerlMemShared_free(trie->bitmap);
2902 OP( convert ) = TRIE;
2904 /* store the type in the flags */
2905 convert->flags = nodetype;
2909 + regarglen[ OP( convert ) ];
2911 /* XXX We really should free up the resource in trie now,
2912 as we won't use them - (which resources?) dmq */
2914 /* needed for dumping*/
2915 DEBUG_r(if (optimize) {
2916 regnode *opt = convert;
2918 while ( ++opt < optimize) {
2919 Set_Node_Offset_Length(opt,0,0);
2922 Try to clean up some of the debris left after the
2925 while( optimize < jumper ) {
2926 mjd_nodelen += Node_Length((optimize));
2927 OP( optimize ) = OPTIMIZED;
2928 Set_Node_Offset_Length(optimize,0,0);
2931 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2933 } /* end node insert */
2934 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2936 /* Finish populating the prev field of the wordinfo array. Walk back
2937 * from each accept state until we find another accept state, and if
2938 * so, point the first word's .prev field at the second word. If the
2939 * second already has a .prev field set, stop now. This will be the
2940 * case either if we've already processed that word's accept state,
2941 * or that state had multiple words, and the overspill words were
2942 * already linked up earlier.
2949 for (word=1; word <= trie->wordcount; word++) {
2951 if (trie->wordinfo[word].prev)
2953 state = trie->wordinfo[word].accept;
2955 state = prev_states[state];
2958 prev = trie->states[state].wordnum;
2962 trie->wordinfo[word].prev = prev;
2964 Safefree(prev_states);
2968 /* and now dump out the compressed format */
2969 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2971 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2973 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2974 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2976 SvREFCNT_dec_NN(revcharmap);
2980 : trie->startstate>1
2986 S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2988 /* The Trie is constructed and compressed now so we can build a fail array if
2991 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2993 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2997 We find the fail state for each state in the trie, this state is the longest
2998 proper suffix of the current state's 'word' that is also a proper prefix of
2999 another word in our trie. State 1 represents the word '' and is thus the
3000 default fail state. This allows the DFA not to have to restart after its
3001 tried and failed a word at a given point, it simply continues as though it
3002 had been matching the other word in the first place.
3004 'abcdgu'=~/abcdefg|cdgu/
3005 When we get to 'd' we are still matching the first word, we would encounter
3006 'g' which would fail, which would bring us to the state representing 'd' in
3007 the second word where we would try 'g' and succeed, proceeding to match
3010 /* add a fail transition */
3011 const U32 trie_offset = ARG(source);
3012 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3014 const U32 ucharcount = trie->uniquecharcount;
3015 const U32 numstates = trie->statecount;
3016 const U32 ubound = trie->lasttrans + ucharcount;
3020 U32 base = trie->states[ 1 ].trans.base;
3023 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3024 GET_RE_DEBUG_FLAGS_DECL;
3026 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
3028 PERL_UNUSED_ARG(depth);
3032 ARG_SET( stclass, data_slot );
3033 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3034 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3035 aho->trie=trie_offset;
3036 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3037 Copy( trie->states, aho->states, numstates, reg_trie_state );
3038 Newxz( q, numstates, U32);
3039 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3042 /* initialize fail[0..1] to be 1 so that we always have
3043 a valid final fail state */
3044 fail[ 0 ] = fail[ 1 ] = 1;
3046 for ( charid = 0; charid < ucharcount ; charid++ ) {
3047 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3049 q[ q_write ] = newstate;
3050 /* set to point at the root */
3051 fail[ q[ q_write++ ] ]=1;
3054 while ( q_read < q_write) {
3055 const U32 cur = q[ q_read++ % numstates ];
3056 base = trie->states[ cur ].trans.base;
3058 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3059 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3061 U32 fail_state = cur;
3064 fail_state = fail[ fail_state ];
3065 fail_base = aho->states[ fail_state ].trans.base;
3066 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3068 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3069 fail[ ch_state ] = fail_state;
3070 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3072 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3074 q[ q_write++ % numstates] = ch_state;
3078 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3079 when we fail in state 1, this allows us to use the
3080 charclass scan to find a valid start char. This is based on the principle
3081 that theres a good chance the string being searched contains lots of stuff
3082 that cant be a start char.
3084 fail[ 0 ] = fail[ 1 ] = 0;
3085 DEBUG_TRIE_COMPILE_r({
3086 PerlIO_printf(Perl_debug_log,
3087 "%*sStclass Failtable (%"UVuf" states): 0",
3088 (int)(depth * 2), "", (UV)numstates
3090 for( q_read=1; q_read<numstates; q_read++ ) {
3091 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3093 PerlIO_printf(Perl_debug_log, "\n");
3096 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3100 #define DEBUG_PEEP(str,scan,depth) \
3101 DEBUG_OPTIMISE_r({if (scan){ \
3102 SV * const mysv=sv_newmortal(); \
3103 regnode *Next = regnext(scan); \
3104 regprop(RExC_rx, mysv, scan, NULL); \
3105 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3106 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3107 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3111 /* The below joins as many adjacent EXACTish nodes as possible into a single
3112 * one. The regop may be changed if the node(s) contain certain sequences that
3113 * require special handling. The joining is only done if:
3114 * 1) there is room in the current conglomerated node to entirely contain the
3116 * 2) they are the exact same node type
3118 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3119 * these get optimized out
3121 * If a node is to match under /i (folded), the number of characters it matches
3122 * can be different than its character length if it contains a multi-character
3123 * fold. *min_subtract is set to the total delta number of characters of the
3126 * And *unfolded_multi_char is set to indicate whether or not the node contains
3127 * an unfolded multi-char fold. This happens when whether the fold is valid or
3128 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3129 * SMALL LETTER SHARP S, as only if the target string being matched against
3130 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3131 * folding rules depend on the locale in force at runtime. (Multi-char folds
3132 * whose components are all above the Latin1 range are not run-time locale
3133 * dependent, and have already been folded by the time this function is
3136 * This is as good a place as any to discuss the design of handling these
3137 * multi-character fold sequences. It's been wrong in Perl for a very long
3138 * time. There are three code points in Unicode whose multi-character folds
3139 * were long ago discovered to mess things up. The previous designs for
3140 * dealing with these involved assigning a special node for them. This
3141 * approach doesn't always work, as evidenced by this example:
3142 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3143 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3144 * would match just the \xDF, it won't be able to handle the case where a
3145 * successful match would have to cross the node's boundary. The new approach
3146 * that hopefully generally solves the problem generates an EXACTFU_SS node
3147 * that is "sss" in this case.
3149 * It turns out that there are problems with all multi-character folds, and not
3150 * just these three. Now the code is general, for all such cases. The
3151 * approach taken is:
3152 * 1) This routine examines each EXACTFish node that could contain multi-
3153 * character folded sequences. Since a single character can fold into
3154 * such a sequence, the minimum match length for this node is less than
3155 * the number of characters in the node. This routine returns in
3156 * *min_subtract how many characters to subtract from the the actual
3157 * length of the string to get a real minimum match length; it is 0 if
3158 * there are no multi-char foldeds. This delta is used by the caller to
3159 * adjust the min length of the match, and the delta between min and max,
3160 * so that the optimizer doesn't reject these possibilities based on size
3162 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3163 * is used for an EXACTFU node that contains at least one "ss" sequence in
3164 * it. For non-UTF-8 patterns and strings, this is the only case where
3165 * there is a possible fold length change. That means that a regular
3166 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3167 * with length changes, and so can be processed faster. regexec.c takes
3168 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3169 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3170 * known until runtime). This saves effort in regex matching. However,
3171 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3172 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3173 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3174 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3175 * possibilities for the non-UTF8 patterns are quite simple, except for
3176 * the sharp s. All the ones that don't involve a UTF-8 target string are
3177 * members of a fold-pair, and arrays are set up for all of them so that
3178 * the other member of the pair can be found quickly. Code elsewhere in
3179 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3180 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3181 * described in the next item.
3182 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3183 * validity of the fold won't be known until runtime, and so must remain
3184 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3185 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3186 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3187 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3188 * The reason this is a problem is that the optimizer part of regexec.c
3189 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3190 * that a character in the pattern corresponds to at most a single
3191 * character in the target string. (And I do mean character, and not byte
3192 * here, unlike other parts of the documentation that have never been
3193 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3194 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3195 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3196 * nodes, violate the assumption, and they are the only instances where it
3197 * is violated. I'm reluctant to try to change the assumption, as the
3198 * code involved is impenetrable to me (khw), so instead the code here
3199 * punts. This routine examines EXACTFL nodes, and (when the pattern
3200 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3201 * boolean indicating whether or not the node contains such a fold. When
3202 * it is true, the caller sets a flag that later causes the optimizer in
3203 * this file to not set values for the floating and fixed string lengths,
3204 * and thus avoids the optimizer code in regexec.c that makes the invalid
3205 * assumption. Thus, there is no optimization based on string lengths for
3206 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3207 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3208 * assumption is wrong only in these cases is that all other non-UTF-8
3209 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3210 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3211 * EXACTF nodes because we don't know at compile time if it actually
3212 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3213 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3214 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3215 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3216 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3217 * string would require the pattern to be forced into UTF-8, the overhead
3218 * of which we want to avoid. Similarly the unfolded multi-char folds in
3219 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3222 * Similarly, the code that generates tries doesn't currently handle
3223 * not-already-folded multi-char folds, and it looks like a pain to change
3224 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3225 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3226 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3227 * using /iaa matching will be doing so almost entirely with ASCII
3228 * strings, so this should rarely be encountered in practice */
3230 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3231 if (PL_regkind[OP(scan)] == EXACT) \
3232 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3235 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3236 UV *min_subtract, bool *unfolded_multi_char,
3237 U32 flags,regnode *val, U32 depth)
3239 /* Merge several consecutive EXACTish nodes into one. */
3240 regnode *n = regnext(scan);
3242 regnode *next = scan + NODE_SZ_STR(scan);
3246 regnode *stop = scan;
3247 GET_RE_DEBUG_FLAGS_DECL;
3249 PERL_UNUSED_ARG(depth);
3252 PERL_ARGS_ASSERT_JOIN_EXACT;
3253 #ifndef EXPERIMENTAL_INPLACESCAN
3254 PERL_UNUSED_ARG(flags);
3255 PERL_UNUSED_ARG(val);
3257 DEBUG_PEEP("join",scan,depth);
3259 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3260 * EXACT ones that are mergeable to the current one. */
3262 && (PL_regkind[OP(n)] == NOTHING
3263 || (stringok && OP(n) == OP(scan)))
3265 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3268 if (OP(n) == TAIL || n > next)
3270 if (PL_regkind[OP(n)] == NOTHING) {
3271 DEBUG_PEEP("skip:",n,depth);
3272 NEXT_OFF(scan) += NEXT_OFF(n);
3273 next = n + NODE_STEP_REGNODE;
3280 else if (stringok) {
3281 const unsigned int oldl = STR_LEN(scan);
3282 regnode * const nnext = regnext(n);
3284 /* XXX I (khw) kind of doubt that this works on platforms (should
3285 * Perl ever run on one) where U8_MAX is above 255 because of lots
3286 * of other assumptions */
3287 /* Don't join if the sum can't fit into a single node */
3288 if (oldl + STR_LEN(n) > U8_MAX)
3291 DEBUG_PEEP("merg",n,depth);
3294 NEXT_OFF(scan) += NEXT_OFF(n);
3295 STR_LEN(scan) += STR_LEN(n);
3296 next = n + NODE_SZ_STR(n);
3297 /* Now we can overwrite *n : */
3298 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3306 #ifdef EXPERIMENTAL_INPLACESCAN
3307 if (flags && !NEXT_OFF(n)) {
3308 DEBUG_PEEP("atch", val, depth);
3309 if (reg_off_by_arg[OP(n)]) {
3310 ARG_SET(n, val - n);
3313 NEXT_OFF(n) = val - n;
3321 *unfolded_multi_char = FALSE;
3323 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3324 * can now analyze for sequences of problematic code points. (Prior to
3325 * this final joining, sequences could have been split over boundaries, and
3326 * hence missed). The sequences only happen in folding, hence for any
3327 * non-EXACT EXACTish node */
3328 if (OP(scan) != EXACT) {
3329 U8* s0 = (U8*) STRING(scan);
3331 U8* s_end = s0 + STR_LEN(scan);
3333 int total_count_delta = 0; /* Total delta number of characters that
3334 multi-char folds expand to */
3336 /* One pass is made over the node's string looking for all the
3337 * possibilities. To avoid some tests in the loop, there are two main
3338 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3343 if (OP(scan) == EXACTFL) {
3346 /* An EXACTFL node would already have been changed to another
3347 * node type unless there is at least one character in it that
3348 * is problematic; likely a character whose fold definition
3349 * won't be known until runtime, and so has yet to be folded.
3350 * For all but the UTF-8 locale, folds are 1-1 in length, but
3351 * to handle the UTF-8 case, we need to create a temporary
3352 * folded copy using UTF-8 locale rules in order to analyze it.
3353 * This is because our macros that look to see if a sequence is
3354 * a multi-char fold assume everything is folded (otherwise the
3355 * tests in those macros would be too complicated and slow).
3356 * Note that here, the non-problematic folds will have already
3357 * been done, so we can just copy such characters. We actually
3358 * don't completely fold the EXACTFL string. We skip the
3359 * unfolded multi-char folds, as that would just create work
3360 * below to figure out the size they already are */
3362 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3365 STRLEN s_len = UTF8SKIP(s);
3366 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3367 Copy(s, d, s_len, U8);
3370 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3371 *unfolded_multi_char = TRUE;
3372 Copy(s, d, s_len, U8);
3375 else if (isASCII(*s)) {
3376 *(d++) = toFOLD(*s);
3380 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3386 /* Point the remainder of the routine to look at our temporary
3390 } /* End of creating folded copy of EXACTFL string */
3392 /* Examine the string for a multi-character fold sequence. UTF-8
3393 * patterns have all characters pre-folded by the time this code is
3395 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3396 length sequence we are looking for is 2 */
3398 int count = 0; /* How many characters in a multi-char fold */
3399 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3400 if (! len) { /* Not a multi-char fold: get next char */
3405 /* Nodes with 'ss' require special handling, except for
3406 * EXACTFA-ish for which there is no multi-char fold to this */
3407 if (len == 2 && *s == 's' && *(s+1) == 's'
3408 && OP(scan) != EXACTFA
3409 && OP(scan) != EXACTFA_NO_TRIE)
3412 if (OP(scan) != EXACTFL) {
3413 OP(scan) = EXACTFU_SS;
3417 else { /* Here is a generic multi-char fold. */
3418 U8* multi_end = s + len;
3420 /* Count how many characters in it. In the case of /aa, no
3421 * folds which contain ASCII code points are allowed, so
3422 * check for those, and skip if found. */
3423 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3424 count = utf8_length(s, multi_end);
3428 while (s < multi_end) {
3431 goto next_iteration;
3441 /* The delta is how long the sequence is minus 1 (1 is how long
3442 * the character that folds to the sequence is) */
3443 total_count_delta += count - 1;
3447 /* We created a temporary folded copy of the string in EXACTFL
3448 * nodes. Therefore we need to be sure it doesn't go below zero,
3449 * as the real string could be shorter */
3450 if (OP(scan) == EXACTFL) {
3451 int total_chars = utf8_length((U8*) STRING(scan),
3452 (U8*) STRING(scan) + STR_LEN(scan));
3453 if (total_count_delta > total_chars) {
3454 total_count_delta = total_chars;
3458 *min_subtract += total_count_delta;
3461 else if (OP(scan) == EXACTFA) {
3463 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3464 * fold to the ASCII range (and there are no existing ones in the
3465 * upper latin1 range). But, as outlined in the comments preceding
3466 * this function, we need to flag any occurrences of the sharp s.
3467 * This character forbids trie formation (because of added
3470 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3471 OP(scan) = EXACTFA_NO_TRIE;
3472 *unfolded_multi_char = TRUE;
3481 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3482 * folds that are all Latin1. As explained in the comments
3483 * preceding this function, we look also for the sharp s in EXACTF
3484 * and EXACTFL nodes; it can be in the final position. Otherwise
3485 * we can stop looking 1 byte earlier because have to find at least
3486 * two characters for a multi-fold */
3487 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3492 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3493 if (! len) { /* Not a multi-char fold. */
3494 if (*s == LATIN_SMALL_LETTER_SHARP_S
3495 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3497 *unfolded_multi_char = TRUE;
3504 && isARG2_lower_or_UPPER_ARG1('s', *s)
3505 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3508 /* EXACTF nodes need to know that the minimum length
3509 * changed so that a sharp s in the string can match this
3510 * ss in the pattern, but they remain EXACTF nodes, as they
3511 * won't match this unless the target string is is UTF-8,
3512 * which we don't know until runtime. EXACTFL nodes can't
3513 * transform into EXACTFU nodes */
3514 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3515 OP(scan) = EXACTFU_SS;
3519 *min_subtract += len - 1;
3526 /* Allow dumping but overwriting the collection of skipped
3527 * ops and/or strings with fake optimized ops */
3528 n = scan + NODE_SZ_STR(scan);
3536 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3540 /* REx optimizer. Converts nodes into quicker variants "in place".
3541 Finds fixed substrings. */
3543 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3544 to the position after last scanned or to NULL. */
3546 #define INIT_AND_WITHP \
3547 assert(!and_withp); \
3548 Newx(and_withp,1, regnode_ssc); \
3549 SAVEFREEPV(and_withp)
3551 /* this is a chain of data about sub patterns we are processing that
3552 need to be handled separately/specially in study_chunk. Its so
3553 we can simulate recursion without losing state. */
3555 typedef struct scan_frame {
3556 regnode *last; /* last node to process in this frame */
3557 regnode *next; /* next node to process when last is reached */
3558 struct scan_frame *prev; /*previous frame*/
3559 U32 prev_recursed_depth;
3560 I32 stop; /* what stopparen do we use */
3565 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3566 SSize_t *minlenp, SSize_t *deltap,
3571 regnode_ssc *and_withp,
3572 U32 flags, U32 depth)
3573 /* scanp: Start here (read-write). */
3574 /* deltap: Write maxlen-minlen here. */
3575 /* last: Stop before this one. */
3576 /* data: string data about the pattern */
3577 /* stopparen: treat close N as END */
3578 /* recursed: which subroutines have we recursed into */
3579 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3582 /* There must be at least this number of characters to match */
3585 regnode *scan = *scanp, *next;
3587 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3588 int is_inf_internal = 0; /* The studied chunk is infinite */
3589 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3590 scan_data_t data_fake;
3591 SV *re_trie_maxbuff = NULL;
3592 regnode *first_non_open = scan;
3593 SSize_t stopmin = SSize_t_MAX;
3594 scan_frame *frame = NULL;
3595 GET_RE_DEBUG_FLAGS_DECL;
3597 PERL_ARGS_ASSERT_STUDY_CHUNK;
3600 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3603 while (first_non_open && OP(first_non_open) == OPEN)
3604 first_non_open=regnext(first_non_open);
3609 while ( scan && OP(scan) != END && scan < last ){
3610 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3611 node length to get a real minimum (because
3612 the folded version may be shorter) */
3613 bool unfolded_multi_char = FALSE;
3614 /* Peephole optimizer: */
3615 DEBUG_OPTIMISE_MORE_r(
3617 PerlIO_printf(Perl_debug_log,
3618 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3619 ((int) depth*2), "", (long)stopparen,
3620 (unsigned long)depth, (unsigned long)recursed_depth);
3621 if (recursed_depth) {
3624 for ( j = 0 ; j < recursed_depth ; j++ ) {
3625 PerlIO_printf(Perl_debug_log,"[");
3626 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3627 PerlIO_printf(Perl_debug_log,"%d",
3628 PAREN_TEST(RExC_study_chunk_recursed +
3629 (j * RExC_study_chunk_recursed_bytes), i)
3632 PerlIO_printf(Perl_debug_log,"]");
3635 PerlIO_printf(Perl_debug_log,"\n");
3638 DEBUG_STUDYDATA("Peep:", data, depth);
3639 DEBUG_PEEP("Peep", scan, depth);
3642 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3643 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3644 * by a different invocation of reg() -- Yves
3646 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3648 /* Follow the next-chain of the current node and optimize
3649 away all the NOTHINGs from it. */
3650 if (OP(scan) != CURLYX) {
3651 const int max = (reg_off_by_arg[OP(scan)]
3653 /* I32 may be smaller than U16 on CRAYs! */
3654 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3655 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3659 /* Skip NOTHING and LONGJMP. */
3660 while ((n = regnext(n))
3661 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3662 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3663 && off + noff < max)
3665 if (reg_off_by_arg[OP(scan)])
3668 NEXT_OFF(scan) = off;
3673 /* The principal pseudo-switch. Cannot be a switch, since we
3674 look into several different things. */
3675 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3676 || OP(scan) == IFTHEN) {
3677 next = regnext(scan);
3679 /* demq: the op(next)==code check is to see if we have
3680 * "branch-branch" AFAICT */
3682 if (OP(next) == code || code == IFTHEN) {
3683 /* NOTE - There is similar code to this block below for
3684 * handling TRIE nodes on a re-study. If you change stuff here
3685 * check there too. */
3686 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3688 regnode * const startbranch=scan;
3690 if (flags & SCF_DO_SUBSTR) {
3691 /* Cannot merge strings after this. */
3692 scan_commit(pRExC_state, data, minlenp, is_inf);
3695 if (flags & SCF_DO_STCLASS)
3696 ssc_init_zero(pRExC_state, &accum);
3698 while (OP(scan) == code) {
3699 SSize_t deltanext, minnext, fake;
3701 regnode_ssc this_class;
3704 data_fake.flags = 0;
3706 data_fake.whilem_c = data->whilem_c;
3707 data_fake.last_closep = data->last_closep;
3710 data_fake.last_closep = &fake;
3712 data_fake.pos_delta = delta;
3713 next = regnext(scan);
3714 scan = NEXTOPER(scan);
3716 scan = NEXTOPER(scan);
3717 if (flags & SCF_DO_STCLASS) {
3718 ssc_init(pRExC_state, &this_class);
3719 data_fake.start_class = &this_class;
3720 f = SCF_DO_STCLASS_AND;
3722 if (flags & SCF_WHILEM_VISITED_POS)
3723 f |= SCF_WHILEM_VISITED_POS;
3725 /* we suppose the run is continuous, last=next...*/
3726 minnext = study_chunk(pRExC_state, &scan, minlenp,
3727 &deltanext, next, &data_fake, stopparen,
3728 recursed_depth, NULL, f,depth+1);
3731 if (deltanext == SSize_t_MAX) {
3732 is_inf = is_inf_internal = 1;
3734 } else if (max1 < minnext + deltanext)
3735 max1 = minnext + deltanext;
3737 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3739 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3740 if ( stopmin > minnext)
3741 stopmin = min + min1;
3742 flags &= ~SCF_DO_SUBSTR;
3744 data->flags |= SCF_SEEN_ACCEPT;
3747 if (data_fake.flags & SF_HAS_EVAL)
3748 data->flags |= SF_HAS_EVAL;
3749 data->whilem_c = data_fake.whilem_c;
3751 if (flags & SCF_DO_STCLASS)
3752 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3754 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3756 if (flags & SCF_DO_SUBSTR) {
3757 data->pos_min += min1;
3758 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3759 data->pos_delta = SSize_t_MAX;
3761 data->pos_delta += max1 - min1;
3762 if (max1 != min1 || is_inf)
3763 data->longest = &(data->longest_float);
3766 if (delta == SSize_t_MAX
3767 || SSize_t_MAX - delta - (max1 - min1) < 0)
3768 delta = SSize_t_MAX;
3770 delta += max1 - min1;
3771 if (flags & SCF_DO_STCLASS_OR) {
3772 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3774 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3775 flags &= ~SCF_DO_STCLASS;
3778 else if (flags & SCF_DO_STCLASS_AND) {
3780 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3781 flags &= ~SCF_DO_STCLASS;
3784 /* Switch to OR mode: cache the old value of
3785 * data->start_class */
3787 StructCopy(data->start_class, and_withp, regnode_ssc);
3788 flags &= ~SCF_DO_STCLASS_AND;
3789 StructCopy(&accum, data->start_class, regnode_ssc);
3790 flags |= SCF_DO_STCLASS_OR;
3794 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3795 OP( startbranch ) == BRANCH )
3799 Assuming this was/is a branch we are dealing with: 'scan'
3800 now points at the item that follows the branch sequence,
3801 whatever it is. We now start at the beginning of the
3802 sequence and look for subsequences of
3808 which would be constructed from a pattern like
3811 If we can find such a subsequence we need to turn the first
3812 element into a trie and then add the subsequent branch exact
3813 strings to the trie.
3817 1. patterns where the whole set of branches can be
3820 2. patterns where only a subset can be converted.
3822 In case 1 we can replace the whole set with a single regop
3823 for the trie. In case 2 we need to keep the start and end
3826 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3827 becomes BRANCH TRIE; BRANCH X;
3829 There is an additional case, that being where there is a
3830 common prefix, which gets split out into an EXACT like node
3831 preceding the TRIE node.
3833 If x(1..n)==tail then we can do a simple trie, if not we make
3834 a "jump" trie, such that when we match the appropriate word
3835 we "jump" to the appropriate tail node. Essentially we turn
3836 a nested if into a case structure of sorts.
3841 if (!re_trie_maxbuff) {
3842 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3843 if (!SvIOK(re_trie_maxbuff))
3844 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3846 if ( SvIV(re_trie_maxbuff)>=0 ) {
3848 regnode *first = (regnode *)NULL;
3849 regnode *last = (regnode *)NULL;
3850 regnode *tail = scan;
3855 SV * const mysv = sv_newmortal(); /* for dumping */
3857 /* var tail is used because there may be a TAIL
3858 regop in the way. Ie, the exacts will point to the
3859 thing following the TAIL, but the last branch will
3860 point at the TAIL. So we advance tail. If we
3861 have nested (?:) we may have to move through several
3865 while ( OP( tail ) == TAIL ) {
3866 /* this is the TAIL generated by (?:) */
3867 tail = regnext( tail );
3871 DEBUG_TRIE_COMPILE_r({
3872 regprop(RExC_rx, mysv, tail, NULL);
3873 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3874 (int)depth * 2 + 2, "",
3875 "Looking for TRIE'able sequences. Tail node is: ",
3876 SvPV_nolen_const( mysv )
3882 Step through the branches
3883 cur represents each branch,
3884 noper is the first thing to be matched as part
3886 noper_next is the regnext() of that node.
3888 We normally handle a case like this
3889 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3890 support building with NOJUMPTRIE, which restricts
3891 the trie logic to structures like /FOO|BAR/.
3893 If noper is a trieable nodetype then the branch is
3894 a possible optimization target. If we are building
3895 under NOJUMPTRIE then we require that noper_next is
3896 the same as scan (our current position in the regex
3899 Once we have two or more consecutive such branches
3900 we can create a trie of the EXACT's contents and
3901 stitch it in place into the program.
3903 If the sequence represents all of the branches in
3904 the alternation we replace the entire thing with a
3907 Otherwise when it is a subsequence we need to
3908 stitch it in place and replace only the relevant
3909 branches. This means the first branch has to remain
3910 as it is used by the alternation logic, and its
3911 next pointer, and needs to be repointed at the item
3912 on the branch chain following the last branch we
3913 have optimized away.
3915 This could be either a BRANCH, in which case the
3916 subsequence is internal, or it could be the item
3917 following the branch sequence in which case the
3918 subsequence is at the end (which does not
3919 necessarily mean the first node is the start of the
3922 TRIE_TYPE(X) is a define which maps the optype to a
3926 ----------------+-----------
3930 EXACTFU_SS | EXACTFU
3935 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3936 ( EXACT == (X) ) ? EXACT : \
3937 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3938 ( EXACTFA == (X) ) ? EXACTFA : \
3941 /* dont use tail as the end marker for this traverse */
3942 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3943 regnode * const noper = NEXTOPER( cur );
3944 U8 noper_type = OP( noper );
3945 U8 noper_trietype = TRIE_TYPE( noper_type );
3946 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3947 regnode * const noper_next = regnext( noper );
3948 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3949 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3952 DEBUG_TRIE_COMPILE_r({
3953 regprop(RExC_rx, mysv, cur, NULL);
3954 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3955 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3957 regprop(RExC_rx, mysv, noper, NULL);
3958 PerlIO_printf( Perl_debug_log, " -> %s",
3959 SvPV_nolen_const(mysv));
3962 regprop(RExC_rx, mysv, noper_next, NULL);
3963 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3964 SvPV_nolen_const(mysv));
3966 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3967 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3968 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3972 /* Is noper a trieable nodetype that can be merged
3973 * with the current trie (if there is one)? */
3977 ( noper_trietype == NOTHING)
3978 || ( trietype == NOTHING )
3979 || ( trietype == noper_trietype )
3982 && noper_next == tail
3986 /* Handle mergable triable node Either we are
3987 * the first node in a new trieable sequence,
3988 * in which case we do some bookkeeping,
3989 * otherwise we update the end pointer. */
3992 if ( noper_trietype == NOTHING ) {
3993 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3994 regnode * const noper_next = regnext( noper );
3995 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
3996 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3999 if ( noper_next_trietype ) {
4000 trietype = noper_next_trietype;
4001 } else if (noper_next_type) {
4002 /* a NOTHING regop is 1 regop wide.
4003 * We need at least two for a trie
4004 * so we can't merge this in */
4008 trietype = noper_trietype;
4011 if ( trietype == NOTHING )
4012 trietype = noper_trietype;
4017 } /* end handle mergable triable node */
4019 /* handle unmergable node -
4020 * noper may either be a triable node which can
4021 * not be tried together with the current trie,
4022 * or a non triable node */
4024 /* If last is set and trietype is not
4025 * NOTHING then we have found at least two
4026 * triable branch sequences in a row of a
4027 * similar trietype so we can turn them
4028 * into a trie. If/when we allow NOTHING to
4029 * start a trie sequence this condition
4030 * will be required, and it isn't expensive
4031 * so we leave it in for now. */
4032 if ( trietype && trietype != NOTHING )
4033 make_trie( pRExC_state,
4034 startbranch, first, cur, tail,
4035 count, trietype, depth+1 );
4036 last = NULL; /* note: we clear/update
4037 first, trietype etc below,
4038 so we dont do it here */
4042 && noper_next == tail
4045 /* noper is triable, so we can start a new
4049 trietype = noper_trietype;
4051 /* if we already saw a first but the
4052 * current node is not triable then we have
4053 * to reset the first information. */
4058 } /* end handle unmergable node */
4059 } /* loop over branches */
4060 DEBUG_TRIE_COMPILE_r({
4061 regprop(RExC_rx, mysv, cur, NULL);
4062 PerlIO_printf( Perl_debug_log,
4063 "%*s- %s (%d) <SCAN FINISHED>\n",
4065 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4068 if ( last && trietype ) {
4069 if ( trietype != NOTHING ) {
4070 /* the last branch of the sequence was part of
4071 * a trie, so we have to construct it here
4072 * outside of the loop */
4073 made= make_trie( pRExC_state, startbranch,
4074 first, scan, tail, count,
4075 trietype, depth+1 );
4076 #ifdef TRIE_STUDY_OPT
4077 if ( ((made == MADE_EXACT_TRIE &&
4078 startbranch == first)
4079 || ( first_non_open == first )) &&
4081 flags |= SCF_TRIE_RESTUDY;
4082 if ( startbranch == first
4085 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4090 /* at this point we know whatever we have is a
4091 * NOTHING sequence/branch AND if 'startbranch'
4092 * is 'first' then we can turn the whole thing
4095 if ( startbranch == first ) {
4097 /* the entire thing is a NOTHING sequence,
4098 * something like this: (?:|) So we can
4099 * turn it into a plain NOTHING op. */
4100 DEBUG_TRIE_COMPILE_r({
4101 regprop(RExC_rx, mysv, cur, NULL);
4102 PerlIO_printf( Perl_debug_log,
4103 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4104 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4107 OP(startbranch)= NOTHING;
4108 NEXT_OFF(startbranch)= tail - startbranch;
4109 for ( opt= startbranch + 1; opt < tail ; opt++ )
4113 } /* end if ( last) */
4114 } /* TRIE_MAXBUF is non zero */
4119 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4120 scan = NEXTOPER(NEXTOPER(scan));
4121 } else /* single branch is optimized. */
4122 scan = NEXTOPER(scan);
4124 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4125 scan_frame *newframe = NULL;
4129 U32 my_recursed_depth= recursed_depth;
4131 if (OP(scan) != SUSPEND) {
4132 /* set the pointer */
4133 if (OP(scan) == GOSUB) {
4135 RExC_recurse[ARG2L(scan)] = scan;
4136 start = RExC_open_parens[paren-1];
4137 end = RExC_close_parens[paren-1];
4140 start = RExC_rxi->program + 1;
4145 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4147 if (!recursed_depth) {
4148 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4150 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4151 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4152 RExC_study_chunk_recursed_bytes, U8);
4154 /* we havent recursed into this paren yet, so recurse into it */
4155 DEBUG_STUDYDATA("set:", data,depth);
4156 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4157 my_recursed_depth= recursed_depth + 1;
4158 Newx(newframe,1,scan_frame);
4160 DEBUG_STUDYDATA("inf:", data,depth);
4161 /* some form of infinite recursion, assume infinite length
4163 if (flags & SCF_DO_SUBSTR) {
4164 scan_commit(pRExC_state, data, minlenp, is_inf);
4165 data->longest = &(data->longest_float);
4167 is_inf = is_inf_internal = 1;
4168 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4169 ssc_anything(data->start_class);
4170 flags &= ~SCF_DO_STCLASS;
4173 Newx(newframe,1,scan_frame);
4176 end = regnext(scan);
4181 SAVEFREEPV(newframe);
4182 newframe->next = regnext(scan);
4183 newframe->last = last;
4184 newframe->stop = stopparen;
4185 newframe->prev = frame;
4186 newframe->prev_recursed_depth = recursed_depth;
4188 DEBUG_STUDYDATA("frame-new:",data,depth);
4189 DEBUG_PEEP("fnew", scan, depth);
4196 recursed_depth= my_recursed_depth;
4201 else if (OP(scan) == EXACT) {
4202 SSize_t l = STR_LEN(scan);
4205 const U8 * const s = (U8*)STRING(scan);
4206 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4207 l = utf8_length(s, s + l);
4209 uc = *((U8*)STRING(scan));
4212 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4213 /* The code below prefers earlier match for fixed
4214 offset, later match for variable offset. */
4215 if (data->last_end == -1) { /* Update the start info. */
4216 data->last_start_min = data->pos_min;
4217 data->last_start_max = is_inf
4218 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4220 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4222 SvUTF8_on(data->last_found);
4224 SV * const sv = data->last_found;
4225 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4226 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4227 if (mg && mg->mg_len >= 0)
4228 mg->mg_len += utf8_length((U8*)STRING(scan),
4229 (U8*)STRING(scan)+STR_LEN(scan));
4231 data->last_end = data->pos_min + l;
4232 data->pos_min += l; /* As in the first entry. */
4233 data->flags &= ~SF_BEFORE_EOL;
4236 /* ANDing the code point leaves at most it, and not in locale, and
4237 * can't match null string */
4238 if (flags & SCF_DO_STCLASS_AND) {
4239 ssc_cp_and(data->start_class, uc);
4240 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4241 ssc_clear_locale(data->start_class);
4243 else if (flags & SCF_DO_STCLASS_OR) {
4244 ssc_add_cp(data->start_class, uc);
4245 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4247 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4248 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4250 flags &= ~SCF_DO_STCLASS;
4252 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
4253 SSize_t l = STR_LEN(scan);
4254 UV uc = *((U8*)STRING(scan));
4255 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4256 separate code points */
4258 /* Search for fixed substrings supports EXACT only. */
4259 if (flags & SCF_DO_SUBSTR) {
4261 scan_commit(pRExC_state, data, minlenp, is_inf);
4264 const U8 * const s = (U8 *)STRING(scan);
4265 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4266 l = utf8_length(s, s + l);
4268 if (unfolded_multi_char) {
4269 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4271 min += l - min_subtract;
4273 delta += min_subtract;
4274 if (flags & SCF_DO_SUBSTR) {
4275 data->pos_min += l - min_subtract;
4276 if (data->pos_min < 0) {
4279 data->pos_delta += min_subtract;
4281 data->longest = &(data->longest_float);
4284 if (OP(scan) == EXACTFL) {
4286 /* We don't know what the folds are; it could be anything. XXX
4287 * Actually, we only support UTF-8 encoding for code points
4288 * above Latin1, so we could know what those folds are. */
4289 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4293 else { /* Non-locale EXACTFish */
4294 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4295 if (flags & SCF_DO_STCLASS_AND) {
4296 ssc_clear_locale(data->start_class);
4298 if (uc < 256) { /* We know what the Latin1 folds are ... */
4299 if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we
4300 know if anything folds
4302 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4303 PL_fold_latin1[uc]);
4304 if (OP(scan) != EXACTFA) { /* The folds below aren't
4306 if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
4308 = add_cp_to_invlist(EXACTF_invlist,
4309 LATIN_SMALL_LETTER_SHARP_S);
4311 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
4313 = add_cp_to_invlist(EXACTF_invlist, 's');
4315 = add_cp_to_invlist(EXACTF_invlist, 'S');
4319 /* We also know if there are above-Latin1 code points
4320 * that fold to this (none legal for ASCII and /iaa) */
4321 if ((! isASCII(uc) || OP(scan) != EXACTFA)
4322 && HAS_NONLATIN1_FOLD_CLOSURE(uc))
4324 /* XXX We could know exactly what does fold to this
4325 * if the reverse folds are loaded, as currently in
4327 _invlist_union(EXACTF_invlist,
4333 else { /* Non-locale, above Latin1. XXX We don't currently
4334 know what participates in folds with this, so have
4335 to assume anything could */
4337 /* XXX We could know exactly what does fold to this if the
4338 * reverse folds are loaded, as currently in S_regclass().
4339 * But we do know that under /iaa nothing in the ASCII
4340 * range can participate */
4341 if (OP(scan) == EXACTFA) {
4342 _invlist_union_complement_2nd(EXACTF_invlist,
4343 PL_XPosix_ptrs[_CC_ASCII],
4347 EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
4352 if (flags & SCF_DO_STCLASS_AND) {
4353 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4354 ANYOF_POSIXL_ZERO(data->start_class);
4355 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4357 else if (flags & SCF_DO_STCLASS_OR) {
4358 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4359 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4361 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4362 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4364 flags &= ~SCF_DO_STCLASS;
4365 SvREFCNT_dec(EXACTF_invlist);
4367 else if (REGNODE_VARIES(OP(scan))) {
4368 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4369 I32 fl = 0, f = flags;
4370 regnode * const oscan = scan;
4371 regnode_ssc this_class;
4372 regnode_ssc *oclass = NULL;
4373 I32 next_is_eval = 0;
4375 switch (PL_regkind[OP(scan)]) {
4376 case WHILEM: /* End of (?:...)* . */
4377 scan = NEXTOPER(scan);
4380 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4381 next = NEXTOPER(scan);
4382 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4384 maxcount = REG_INFTY;
4385 next = regnext(scan);
4386 scan = NEXTOPER(scan);
4390 if (flags & SCF_DO_SUBSTR)
4395 if (flags & SCF_DO_STCLASS) {
4397 maxcount = REG_INFTY;
4398 next = regnext(scan);
4399 scan = NEXTOPER(scan);
4402 if (flags & SCF_DO_SUBSTR) {
4403 scan_commit(pRExC_state, data, minlenp, is_inf);
4404 /* Cannot extend fixed substrings */
4405 data->longest = &(data->longest_float);
4407 is_inf = is_inf_internal = 1;
4408 scan = regnext(scan);
4409 goto optimize_curly_tail;
4411 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4412 && (scan->flags == stopparen))
4417 mincount = ARG1(scan);
4418 maxcount = ARG2(scan);
4420 next = regnext(scan);
4421 if (OP(scan) == CURLYX) {
4422 I32 lp = (data ? *(data->last_closep) : 0);
4423 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4425 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4426 next_is_eval = (OP(scan) == EVAL);
4428 if (flags & SCF_DO_SUBSTR) {
4430 scan_commit(pRExC_state, data, minlenp, is_inf);
4431 /* Cannot extend fixed substrings */
4432 pos_before = data->pos_min;
4436 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4438 data->flags |= SF_IS_INF;
4440 if (flags & SCF_DO_STCLASS) {
4441 ssc_init(pRExC_state, &this_class);
4442 oclass = data->start_class;
4443 data->start_class = &this_class;
4444 f |= SCF_DO_STCLASS_AND;
4445 f &= ~SCF_DO_STCLASS_OR;
4447 /* Exclude from super-linear cache processing any {n,m}
4448 regops for which the combination of input pos and regex
4449 pos is not enough information to determine if a match
4452 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4453 regex pos at the \s*, the prospects for a match depend not
4454 only on the input position but also on how many (bar\s*)
4455 repeats into the {4,8} we are. */
4456 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4457 f &= ~SCF_WHILEM_VISITED_POS;
4459 /* This will finish on WHILEM, setting scan, or on NULL: */
4460 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4461 last, data, stopparen, recursed_depth, NULL,
4463 ? (f & ~SCF_DO_SUBSTR)
4467 if (flags & SCF_DO_STCLASS)
4468 data->start_class = oclass;
4469 if (mincount == 0 || minnext == 0) {
4470 if (flags & SCF_DO_STCLASS_OR) {
4471 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4473 else if (flags & SCF_DO_STCLASS_AND) {
4474 /* Switch to OR mode: cache the old value of
4475 * data->start_class */
4477 StructCopy(data->start_class, and_withp, regnode_ssc);
4478 flags &= ~SCF_DO_STCLASS_AND;
4479 StructCopy(&this_class, data->start_class, regnode_ssc);
4480 flags |= SCF_DO_STCLASS_OR;
4481 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4483 } else { /* Non-zero len */
4484 if (flags & SCF_DO_STCLASS_OR) {
4485 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4486 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4488 else if (flags & SCF_DO_STCLASS_AND)
4489 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4490 flags &= ~SCF_DO_STCLASS;
4492 if (!scan) /* It was not CURLYX, but CURLY. */
4494 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4495 /* ? quantifier ok, except for (?{ ... }) */
4496 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4497 && (minnext == 0) && (deltanext == 0)
4498 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4499 && maxcount <= REG_INFTY/3) /* Complement check for big
4502 /* Fatal warnings may leak the regexp without this: */
4503 SAVEFREESV(RExC_rx_sv);
4504 ckWARNreg(RExC_parse,
4505 "Quantifier unexpected on zero-length expression");
4506 (void)ReREFCNT_inc(RExC_rx_sv);
4509 min += minnext * mincount;
4510 is_inf_internal |= deltanext == SSize_t_MAX
4511 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4512 is_inf |= is_inf_internal;
4514 delta = SSize_t_MAX;
4516 delta += (minnext + deltanext) * maxcount
4517 - minnext * mincount;
4519 /* Try powerful optimization CURLYX => CURLYN. */
4520 if ( OP(oscan) == CURLYX && data
4521 && data->flags & SF_IN_PAR
4522 && !(data->flags & SF_HAS_EVAL)
4523 && !deltanext && minnext == 1 ) {
4524 /* Try to optimize to CURLYN. */
4525 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4526 regnode * const nxt1 = nxt;
4533 if (!REGNODE_SIMPLE(OP(nxt))
4534 && !(PL_regkind[OP(nxt)] == EXACT
4535 && STR_LEN(nxt) == 1))
4541 if (OP(nxt) != CLOSE)
4543 if (RExC_open_parens) {
4544 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4545 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4547 /* Now we know that nxt2 is the only contents: */
4548 oscan->flags = (U8)ARG(nxt);
4550 OP(nxt1) = NOTHING; /* was OPEN. */
4553 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4554 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4555 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4556 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4557 OP(nxt + 1) = OPTIMIZED; /* was count. */
4558 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4563 /* Try optimization CURLYX => CURLYM. */
4564 if ( OP(oscan) == CURLYX && data
4565 && !(data->flags & SF_HAS_PAR)
4566 && !(data->flags & SF_HAS_EVAL)
4567 && !deltanext /* atom is fixed width */
4568 && minnext != 0 /* CURLYM can't handle zero width */
4570 /* Nor characters whose fold at run-time may be
4571 * multi-character */
4572 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4574 /* XXXX How to optimize if data == 0? */
4575 /* Optimize to a simpler form. */
4576 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4580 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4581 && (OP(nxt2) != WHILEM))
4583 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4584 /* Need to optimize away parenths. */
4585 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4586 /* Set the parenth number. */
4587 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4589 oscan->flags = (U8)ARG(nxt);
4590 if (RExC_open_parens) {
4591 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4592 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4594 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4595 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4598 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4599 OP(nxt + 1) = OPTIMIZED; /* was count. */
4600 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4601 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4604 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4605 regnode *nnxt = regnext(nxt1);
4607 if (reg_off_by_arg[OP(nxt1)])
4608 ARG_SET(nxt1, nxt2 - nxt1);
4609 else if (nxt2 - nxt1 < U16_MAX)
4610 NEXT_OFF(nxt1) = nxt2 - nxt1;
4612 OP(nxt) = NOTHING; /* Cannot beautify */
4617 /* Optimize again: */
4618 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4619 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4624 else if ((OP(oscan) == CURLYX)
4625 && (flags & SCF_WHILEM_VISITED_POS)
4626 /* See the comment on a similar expression above.
4627 However, this time it's not a subexpression
4628 we care about, but the expression itself. */
4629 && (maxcount == REG_INFTY)
4630 && data && ++data->whilem_c < 16) {
4631 /* This stays as CURLYX, we can put the count/of pair. */
4632 /* Find WHILEM (as in regexec.c) */
4633 regnode *nxt = oscan + NEXT_OFF(oscan);
4635 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4637 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4638 | (RExC_whilem_seen << 4)); /* On WHILEM */
4640 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4642 if (flags & SCF_DO_SUBSTR) {
4643 SV *last_str = NULL;
4644 STRLEN last_chrs = 0;
4645 int counted = mincount != 0;
4647 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4649 SSize_t b = pos_before >= data->last_start_min
4650 ? pos_before : data->last_start_min;
4652 const char * const s = SvPV_const(data->last_found, l);
4653 SSize_t old = b - data->last_start_min;
4656 old = utf8_hop((U8*)s, old) - (U8*)s;
4658 /* Get the added string: */
4659 last_str = newSVpvn_utf8(s + old, l, UTF);
4660 last_chrs = UTF ? utf8_length((U8*)(s + old),
4661 (U8*)(s + old + l)) : l;
4662 if (deltanext == 0 && pos_before == b) {
4663 /* What was added is a constant string */
4666 SvGROW(last_str, (mincount * l) + 1);
4667 repeatcpy(SvPVX(last_str) + l,
4668 SvPVX_const(last_str), l,
4670 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4671 /* Add additional parts. */
4672 SvCUR_set(data->last_found,
4673 SvCUR(data->last_found) - l);
4674 sv_catsv(data->last_found, last_str);
4676 SV * sv = data->last_found;
4678 SvUTF8(sv) && SvMAGICAL(sv) ?
4679 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4680 if (mg && mg->mg_len >= 0)
4681 mg->mg_len += last_chrs * (mincount-1);
4683 last_chrs *= mincount;
4684 data->last_end += l * (mincount - 1);
4687 /* start offset must point into the last copy */
4688 data->last_start_min += minnext * (mincount - 1);
4689 data->last_start_max += is_inf ? SSize_t_MAX
4690 : (maxcount - 1) * (minnext + data->pos_delta);
4693 /* It is counted once already... */
4694 data->pos_min += minnext * (mincount - counted);
4696 PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
4697 " SSize_t_MAX=%"UVdf" minnext=%"UVdf
4698 " maxcount=%"UVdf" mincount=%"UVdf"\n",
4699 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4701 if (deltanext != SSize_t_MAX)
4702 PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
4703 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4704 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4706 if (deltanext == SSize_t_MAX
4707 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4708 data->pos_delta = SSize_t_MAX;
4710 data->pos_delta += - counted * deltanext +
4711 (minnext + deltanext) * maxcount - minnext * mincount;
4712 if (mincount != maxcount) {
4713 /* Cannot extend fixed substrings found inside
4715 scan_commit(pRExC_state, data, minlenp, is_inf);
4716 if (mincount && last_str) {
4717 SV * const sv = data->last_found;
4718 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4719 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4723 sv_setsv(sv, last_str);
4724 data->last_end = data->pos_min;
4725 data->last_start_min = data->pos_min - last_chrs;
4726 data->last_start_max = is_inf
4728 : data->pos_min + data->pos_delta - last_chrs;
4730 data->longest = &(data->longest_float);
4732 SvREFCNT_dec(last_str);
4734 if (data && (fl & SF_HAS_EVAL))
4735 data->flags |= SF_HAS_EVAL;
4736 optimize_curly_tail:
4737 if (OP(oscan) != CURLYX) {
4738 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4740 NEXT_OFF(oscan) += NEXT_OFF(next);
4746 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4751 if (flags & SCF_DO_SUBSTR) {
4752 /* Cannot expect anything... */
4753 scan_commit(pRExC_state, data, minlenp, is_inf);
4754 data->longest = &(data->longest_float);
4756 is_inf = is_inf_internal = 1;
4757 if (flags & SCF_DO_STCLASS_OR) {
4758 if (OP(scan) == CLUMP) {
4759 /* Actually is any start char, but very few code points
4760 * aren't start characters */
4761 ssc_match_all_cp(data->start_class);
4764 ssc_anything(data->start_class);
4767 flags &= ~SCF_DO_STCLASS;
4771 else if (OP(scan) == LNBREAK) {
4772 if (flags & SCF_DO_STCLASS) {
4773 if (flags & SCF_DO_STCLASS_AND) {
4774 ssc_intersection(data->start_class,
4775 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4776 ssc_clear_locale(data->start_class);
4777 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4779 else if (flags & SCF_DO_STCLASS_OR) {
4780 ssc_union(data->start_class,
4781 PL_XPosix_ptrs[_CC_VERTSPACE],
4783 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4785 /* See commit msg for
4786 * 749e076fceedeb708a624933726e7989f2302f6a */
4787 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4789 flags &= ~SCF_DO_STCLASS;
4792 delta++; /* Because of the 2 char string cr-lf */
4793 if (flags & SCF_DO_SUBSTR) {
4794 /* Cannot expect anything... */
4795 scan_commit(pRExC_state, data, minlenp, is_inf);
4797 data->pos_delta += 1;
4798 data->longest = &(data->longest_float);
4801 else if (REGNODE_SIMPLE(OP(scan))) {
4803 if (flags & SCF_DO_SUBSTR) {
4804 scan_commit(pRExC_state, data, minlenp, is_inf);
4808 if (flags & SCF_DO_STCLASS) {
4810 SV* my_invlist = NULL;
4813 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4814 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4816 /* Some of the logic below assumes that switching
4817 locale on will only add false positives. */
4822 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4827 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4828 ssc_match_all_cp(data->start_class);
4833 SV* REG_ANY_invlist = _new_invlist(2);
4834 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4836 if (flags & SCF_DO_STCLASS_OR) {
4837 ssc_union(data->start_class,
4839 TRUE /* TRUE => invert, hence all but \n
4843 else if (flags & SCF_DO_STCLASS_AND) {
4844 ssc_intersection(data->start_class,
4846 TRUE /* TRUE => invert */
4848 ssc_clear_locale(data->start_class);
4850 SvREFCNT_dec_NN(REG_ANY_invlist);
4855 if (flags & SCF_DO_STCLASS_AND)
4856 ssc_and(pRExC_state, data->start_class,
4857 (regnode_charclass *) scan);
4859 ssc_or(pRExC_state, data->start_class,
4860 (regnode_charclass *) scan);
4868 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4869 if (flags & SCF_DO_STCLASS_AND) {
4870 bool was_there = cBOOL(
4871 ANYOF_POSIXL_TEST(data->start_class,
4873 ANYOF_POSIXL_ZERO(data->start_class);
4874 if (was_there) { /* Do an AND */
4875 ANYOF_POSIXL_SET(data->start_class, namedclass);
4877 /* No individual code points can now match */
4878 data->start_class->invlist
4879 = sv_2mortal(_new_invlist(0));
4882 int complement = namedclass + ((invert) ? -1 : 1);
4884 assert(flags & SCF_DO_STCLASS_OR);
4886 /* If the complement of this class was already there,
4887 * the result is that they match all code points,
4888 * (\d + \D == everything). Remove the classes from
4889 * future consideration. Locale is not relevant in
4891 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4892 ssc_match_all_cp(data->start_class);
4893 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4894 ANYOF_POSIXL_CLEAR(data->start_class, complement);
4896 else { /* The usual case; just add this class to the
4898 ANYOF_POSIXL_SET(data->start_class, namedclass);
4903 case NPOSIXA: /* For these, we always know the exact set of
4908 if (FLAGS(scan) == _CC_ASCII) {
4909 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
4912 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4913 PL_XPosix_ptrs[_CC_ASCII],
4924 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
4926 /* NPOSIXD matches all upper Latin1 code points unless the
4927 * target string being matched is UTF-8, which is
4928 * unknowable until match time. Since we are going to
4929 * invert, we want to get rid of all of them so that the
4930 * inversion will match all */
4931 if (OP(scan) == NPOSIXD) {
4932 _invlist_subtract(my_invlist, PL_UpperLatin1,
4938 if (flags & SCF_DO_STCLASS_AND) {
4939 ssc_intersection(data->start_class, my_invlist, invert);
4940 ssc_clear_locale(data->start_class);
4943 assert(flags & SCF_DO_STCLASS_OR);
4944 ssc_union(data->start_class, my_invlist, invert);
4946 SvREFCNT_dec(my_invlist);
4948 if (flags & SCF_DO_STCLASS_OR)
4949 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4950 flags &= ~SCF_DO_STCLASS;
4953 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4954 data->flags |= (OP(scan) == MEOL
4957 scan_commit(pRExC_state, data, minlenp, is_inf);
4960 else if ( PL_regkind[OP(scan)] == BRANCHJ
4961 /* Lookbehind, or need to calculate parens/evals/stclass: */
4962 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4963 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4964 if ( OP(scan) == UNLESSM &&
4966 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
4967 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
4970 regnode *upto= regnext(scan);
4972 SV * const mysv_val=sv_newmortal();
4973 DEBUG_STUDYDATA("OPFAIL",data,depth);
4975 /*DEBUG_PARSE_MSG("opfail");*/
4976 regprop(RExC_rx, mysv_val, upto, NULL);
4977 PerlIO_printf(Perl_debug_log,
4978 "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
4979 SvPV_nolen_const(mysv_val),
4980 (IV)REG_NODE_NUM(upto),
4985 NEXT_OFF(scan) = upto - scan;
4986 for (opt= scan + 1; opt < upto ; opt++)
4987 OP(opt) = OPTIMIZED;
4991 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4992 || OP(scan) == UNLESSM )
4994 /* Negative Lookahead/lookbehind
4995 In this case we can't do fixed string optimisation.
4998 SSize_t deltanext, minnext, fake = 0;
5003 data_fake.flags = 0;
5005 data_fake.whilem_c = data->whilem_c;
5006 data_fake.last_closep = data->last_closep;
5009 data_fake.last_closep = &fake;
5010 data_fake.pos_delta = delta;
5011 if ( flags & SCF_DO_STCLASS && !scan->flags
5012 && OP(scan) == IFMATCH ) { /* Lookahead */
5013 ssc_init(pRExC_state, &intrnl);
5014 data_fake.start_class = &intrnl;
5015 f |= SCF_DO_STCLASS_AND;
5017 if (flags & SCF_WHILEM_VISITED_POS)
5018 f |= SCF_WHILEM_VISITED_POS;
5019 next = regnext(scan);
5020 nscan = NEXTOPER(NEXTOPER(scan));
5021 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5022 last, &data_fake, stopparen,
5023 recursed_depth, NULL, f, depth+1);
5026 FAIL("Variable length lookbehind not implemented");
5028 else if (minnext > (I32)U8_MAX) {
5029 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5032 scan->flags = (U8)minnext;
5035 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5037 if (data_fake.flags & SF_HAS_EVAL)
5038 data->flags |= SF_HAS_EVAL;
5039 data->whilem_c = data_fake.whilem_c;
5041 if (f & SCF_DO_STCLASS_AND) {
5042 if (flags & SCF_DO_STCLASS_OR) {
5043 /* OR before, AND after: ideally we would recurse with
5044 * data_fake to get the AND applied by study of the
5045 * remainder of the pattern, and then derecurse;
5046 * *** HACK *** for now just treat as "no information".
5047 * See [perl #56690].
5049 ssc_init(pRExC_state, data->start_class);
5051 /* AND before and after: combine and continue. These
5052 * assertions are zero-length, so can match an EMPTY
5054 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5055 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5059 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5061 /* Positive Lookahead/lookbehind
5062 In this case we can do fixed string optimisation,
5063 but we must be careful about it. Note in the case of
5064 lookbehind the positions will be offset by the minimum
5065 length of the pattern, something we won't know about
5066 until after the recurse.
5068 SSize_t deltanext, fake = 0;
5072 /* We use SAVEFREEPV so that when the full compile
5073 is finished perl will clean up the allocated
5074 minlens when it's all done. This way we don't
5075 have to worry about freeing them when we know
5076 they wont be used, which would be a pain.
5079 Newx( minnextp, 1, SSize_t );
5080 SAVEFREEPV(minnextp);
5083 StructCopy(data, &data_fake, scan_data_t);
5084 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5087 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5088 data_fake.last_found=newSVsv(data->last_found);
5092 data_fake.last_closep = &fake;
5093 data_fake.flags = 0;
5094 data_fake.pos_delta = delta;
5096 data_fake.flags |= SF_IS_INF;
5097 if ( flags & SCF_DO_STCLASS && !scan->flags
5098 && OP(scan) == IFMATCH ) { /* Lookahead */
5099 ssc_init(pRExC_state, &intrnl);
5100 data_fake.start_class = &intrnl;
5101 f |= SCF_DO_STCLASS_AND;
5103 if (flags & SCF_WHILEM_VISITED_POS)
5104 f |= SCF_WHILEM_VISITED_POS;
5105 next = regnext(scan);
5106 nscan = NEXTOPER(NEXTOPER(scan));
5108 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5109 &deltanext, last, &data_fake,
5110 stopparen, recursed_depth, NULL,
5114 FAIL("Variable length lookbehind not implemented");
5116 else if (*minnextp > (I32)U8_MAX) {
5117 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5120 scan->flags = (U8)*minnextp;
5125 if (f & SCF_DO_STCLASS_AND) {
5126 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5127 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5130 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5132 if (data_fake.flags & SF_HAS_EVAL)
5133 data->flags |= SF_HAS_EVAL;
5134 data->whilem_c = data_fake.whilem_c;
5135 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5136 if (RExC_rx->minlen<*minnextp)
5137 RExC_rx->minlen=*minnextp;
5138 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5139 SvREFCNT_dec_NN(data_fake.last_found);
5141 if ( data_fake.minlen_fixed != minlenp )
5143 data->offset_fixed= data_fake.offset_fixed;
5144 data->minlen_fixed= data_fake.minlen_fixed;
5145 data->lookbehind_fixed+= scan->flags;
5147 if ( data_fake.minlen_float != minlenp )
5149 data->minlen_float= data_fake.minlen_float;
5150 data->offset_float_min=data_fake.offset_float_min;
5151 data->offset_float_max=data_fake.offset_float_max;
5152 data->lookbehind_float+= scan->flags;
5159 else if (OP(scan) == OPEN) {
5160 if (stopparen != (I32)ARG(scan))
5163 else if (OP(scan) == CLOSE) {
5164 if (stopparen == (I32)ARG(scan)) {
5167 if ((I32)ARG(scan) == is_par) {
5168 next = regnext(scan);
5170 if ( next && (OP(next) != WHILEM) && next < last)
5171 is_par = 0; /* Disable optimization */
5174 *(data->last_closep) = ARG(scan);
5176 else if (OP(scan) == EVAL) {
5178 data->flags |= SF_HAS_EVAL;
5180 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5181 if (flags & SCF_DO_SUBSTR) {
5182 scan_commit(pRExC_state, data, minlenp, is_inf);
5183 flags &= ~SCF_DO_SUBSTR;
5185 if (data && OP(scan)==ACCEPT) {
5186 data->flags |= SCF_SEEN_ACCEPT;
5191 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5193 if (flags & SCF_DO_SUBSTR) {
5194 scan_commit(pRExC_state, data, minlenp, is_inf);
5195 data->longest = &(data->longest_float);
5197 is_inf = is_inf_internal = 1;
5198 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5199 ssc_anything(data->start_class);
5200 flags &= ~SCF_DO_STCLASS;
5202 else if (OP(scan) == GPOS) {
5203 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5204 !(delta || is_inf || (data && data->pos_delta)))
5206 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5207 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5208 if (RExC_rx->gofs < (STRLEN)min)
5209 RExC_rx->gofs = min;
5211 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5215 #ifdef TRIE_STUDY_OPT
5216 #ifdef FULL_TRIE_STUDY
5217 else if (PL_regkind[OP(scan)] == TRIE) {
5218 /* NOTE - There is similar code to this block above for handling
5219 BRANCH nodes on the initial study. If you change stuff here
5221 regnode *trie_node= scan;
5222 regnode *tail= regnext(scan);
5223 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5224 SSize_t max1 = 0, min1 = SSize_t_MAX;
5227 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5228 /* Cannot merge strings after this. */
5229 scan_commit(pRExC_state, data, minlenp, is_inf);
5231 if (flags & SCF_DO_STCLASS)
5232 ssc_init_zero(pRExC_state, &accum);
5238 const regnode *nextbranch= NULL;
5241 for ( word=1 ; word <= trie->wordcount ; word++)
5243 SSize_t deltanext=0, minnext=0, f = 0, fake;
5244 regnode_ssc this_class;
5246 data_fake.flags = 0;
5248 data_fake.whilem_c = data->whilem_c;
5249 data_fake.last_closep = data->last_closep;
5252 data_fake.last_closep = &fake;
5253 data_fake.pos_delta = delta;
5254 if (flags & SCF_DO_STCLASS) {
5255 ssc_init(pRExC_state, &this_class);
5256 data_fake.start_class = &this_class;
5257 f = SCF_DO_STCLASS_AND;
5259 if (flags & SCF_WHILEM_VISITED_POS)
5260 f |= SCF_WHILEM_VISITED_POS;
5262 if (trie->jump[word]) {
5264 nextbranch = trie_node + trie->jump[0];
5265 scan= trie_node + trie->jump[word];
5266 /* We go from the jump point to the branch that follows
5267 it. Note this means we need the vestigal unused
5268 branches even though they arent otherwise used. */
5269 minnext = study_chunk(pRExC_state, &scan, minlenp,
5270 &deltanext, (regnode *)nextbranch, &data_fake,
5271 stopparen, recursed_depth, NULL, f,depth+1);
5273 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5274 nextbranch= regnext((regnode*)nextbranch);
5276 if (min1 > (SSize_t)(minnext + trie->minlen))
5277 min1 = minnext + trie->minlen;
5278 if (deltanext == SSize_t_MAX) {
5279 is_inf = is_inf_internal = 1;
5281 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5282 max1 = minnext + deltanext + trie->maxlen;
5284 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5286 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5287 if ( stopmin > min + min1)
5288 stopmin = min + min1;
5289 flags &= ~SCF_DO_SUBSTR;
5291 data->flags |= SCF_SEEN_ACCEPT;
5294 if (data_fake.flags & SF_HAS_EVAL)
5295 data->flags |= SF_HAS_EVAL;
5296 data->whilem_c = data_fake.whilem_c;
5298 if (flags & SCF_DO_STCLASS)
5299 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5302 if (flags & SCF_DO_SUBSTR) {
5303 data->pos_min += min1;
5304 data->pos_delta += max1 - min1;
5305 if (max1 != min1 || is_inf)
5306 data->longest = &(data->longest_float);
5309 delta += max1 - min1;
5310 if (flags & SCF_DO_STCLASS_OR) {
5311 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5313 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5314 flags &= ~SCF_DO_STCLASS;
5317 else if (flags & SCF_DO_STCLASS_AND) {
5319 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5320 flags &= ~SCF_DO_STCLASS;
5323 /* Switch to OR mode: cache the old value of
5324 * data->start_class */
5326 StructCopy(data->start_class, and_withp, regnode_ssc);
5327 flags &= ~SCF_DO_STCLASS_AND;
5328 StructCopy(&accum, data->start_class, regnode_ssc);
5329 flags |= SCF_DO_STCLASS_OR;
5336 else if (PL_regkind[OP(scan)] == TRIE) {
5337 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5340 min += trie->minlen;
5341 delta += (trie->maxlen - trie->minlen);
5342 flags &= ~SCF_DO_STCLASS; /* xxx */
5343 if (flags & SCF_DO_SUBSTR) {
5344 /* Cannot expect anything... */
5345 scan_commit(pRExC_state, data, minlenp, is_inf);
5346 data->pos_min += trie->minlen;
5347 data->pos_delta += (trie->maxlen - trie->minlen);
5348 if (trie->maxlen != trie->minlen)
5349 data->longest = &(data->longest_float);
5351 if (trie->jump) /* no more substrings -- for now /grr*/
5352 flags &= ~SCF_DO_SUBSTR;
5354 #endif /* old or new */
5355 #endif /* TRIE_STUDY_OPT */
5357 /* Else: zero-length, ignore. */
5358 scan = regnext(scan);
5360 /* If we are exiting a recursion we can unset its recursed bit
5361 * and allow ourselves to enter it again - no danger of an
5362 * infinite loop there.
5363 if (stopparen > -1 && recursed) {
5364 DEBUG_STUDYDATA("unset:", data,depth);
5365 PAREN_UNSET( recursed, stopparen);
5369 DEBUG_STUDYDATA("frame-end:",data,depth);
5370 DEBUG_PEEP("fend", scan, depth);
5371 /* restore previous context */
5374 stopparen = frame->stop;
5375 recursed_depth = frame->prev_recursed_depth;
5378 frame = frame->prev;
5379 goto fake_study_recurse;
5384 DEBUG_STUDYDATA("pre-fin:",data,depth);
5387 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5389 if (flags & SCF_DO_SUBSTR && is_inf)
5390 data->pos_delta = SSize_t_MAX - data->pos_min;
5391 if (is_par > (I32)U8_MAX)
5393 if (is_par && pars==1 && data) {
5394 data->flags |= SF_IN_PAR;
5395 data->flags &= ~SF_HAS_PAR;
5397 else if (pars && data) {
5398 data->flags |= SF_HAS_PAR;
5399 data->flags &= ~SF_IN_PAR;
5401 if (flags & SCF_DO_STCLASS_OR)
5402 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5403 if (flags & SCF_TRIE_RESTUDY)
5404 data->flags |= SCF_TRIE_RESTUDY;
5406 DEBUG_STUDYDATA("post-fin:",data,depth);
5409 SSize_t final_minlen= min < stopmin ? min : stopmin;
5411 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5412 RExC_maxlen = final_minlen + delta;
5414 return final_minlen;
5420 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5422 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5424 PERL_ARGS_ASSERT_ADD_DATA;
5426 Renewc(RExC_rxi->data,
5427 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5428 char, struct reg_data);
5430 Renew(RExC_rxi->data->what, count + n, U8);
5432 Newx(RExC_rxi->data->what, n, U8);
5433 RExC_rxi->data->count = count + n;
5434 Copy(s, RExC_rxi->data->what + count, n, U8);
5438 /*XXX: todo make this not included in a non debugging perl */
5439 #ifndef PERL_IN_XSUB_RE
5441 Perl_reginitcolors(pTHX)
5444 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5446 char *t = savepv(s);
5450 t = strchr(t, '\t');
5456 PL_colors[i] = t = (char *)"";
5461 PL_colors[i++] = (char *)"";
5468 #ifdef TRIE_STUDY_OPT
5469 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5472 (data.flags & SCF_TRIE_RESTUDY) \
5480 #define CHECK_RESTUDY_GOTO_butfirst
5484 * pregcomp - compile a regular expression into internal code
5486 * Decides which engine's compiler to call based on the hint currently in
5490 #ifndef PERL_IN_XSUB_RE
5492 /* return the currently in-scope regex engine (or the default if none) */
5494 regexp_engine const *
5495 Perl_current_re_engine(pTHX)
5499 if (IN_PERL_COMPILETIME) {
5500 HV * const table = GvHV(PL_hintgv);
5503 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5504 return &reh_regexp_engine;
5505 ptr = hv_fetchs(table, "regcomp", FALSE);
5506 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5507 return &reh_regexp_engine;
5508 return INT2PTR(regexp_engine*,SvIV(*ptr));
5512 if (!PL_curcop->cop_hints_hash)
5513 return &reh_regexp_engine;
5514 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5515 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5516 return &reh_regexp_engine;
5517 return INT2PTR(regexp_engine*,SvIV(ptr));
5523 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5526 regexp_engine const *eng = current_re_engine();
5527 GET_RE_DEBUG_FLAGS_DECL;
5529 PERL_ARGS_ASSERT_PREGCOMP;
5531 /* Dispatch a request to compile a regexp to correct regexp engine. */
5533 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5536 return CALLREGCOMP_ENG(eng, pattern, flags);
5540 /* public(ish) entry point for the perl core's own regex compiling code.
5541 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5542 * pattern rather than a list of OPs, and uses the internal engine rather
5543 * than the current one */
5546 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5548 SV *pat = pattern; /* defeat constness! */
5549 PERL_ARGS_ASSERT_RE_COMPILE;
5550 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5551 #ifdef PERL_IN_XSUB_RE
5556 NULL, NULL, rx_flags, 0);
5560 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5561 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5562 * point to the realloced string and length.
5564 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5568 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5569 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5571 U8 *const src = (U8*)*pat_p;
5574 STRLEN s = 0, d = 0;
5576 GET_RE_DEBUG_FLAGS_DECL;
5578 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5579 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5581 Newx(dst, *plen_p * 2 + 1, U8);
5583 while (s < *plen_p) {
5584 if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5587 dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5588 dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
5590 if (n < num_code_blocks) {
5591 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5592 pRExC_state->code_blocks[n].start = d;
5593 assert(dst[d] == '(');
5596 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5597 pRExC_state->code_blocks[n].end = d;
5598 assert(dst[d] == ')');
5608 *pat_p = (char*) dst;
5610 RExC_orig_utf8 = RExC_utf8 = 1;
5615 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5616 * while recording any code block indices, and handling overloading,
5617 * nested qr// objects etc. If pat is null, it will allocate a new
5618 * string, or just return the first arg, if there's only one.
5620 * Returns the malloced/updated pat.
5621 * patternp and pat_count is the array of SVs to be concatted;
5622 * oplist is the optional list of ops that generated the SVs;
5623 * recompile_p is a pointer to a boolean that will be set if
5624 * the regex will need to be recompiled.
5625 * delim, if non-null is an SV that will be inserted between each element
5629 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5630 SV *pat, SV ** const patternp, int pat_count,
5631 OP *oplist, bool *recompile_p, SV *delim)
5635 bool use_delim = FALSE;
5636 bool alloced = FALSE;
5638 /* if we know we have at least two args, create an empty string,
5639 * then concatenate args to that. For no args, return an empty string */
5640 if (!pat && pat_count != 1) {
5641 pat = newSVpvn("", 0);
5646 for (svp = patternp; svp < patternp + pat_count; svp++) {
5649 STRLEN orig_patlen = 0;
5651 SV *msv = use_delim ? delim : *svp;
5652 if (!msv) msv = &PL_sv_undef;
5654 /* if we've got a delimiter, we go round the loop twice for each
5655 * svp slot (except the last), using the delimiter the second
5664 if (SvTYPE(msv) == SVt_PVAV) {
5665 /* we've encountered an interpolated array within
5666 * the pattern, e.g. /...@a..../. Expand the list of elements,
5667 * then recursively append elements.
5668 * The code in this block is based on S_pushav() */
5670 AV *const av = (AV*)msv;
5671 const SSize_t maxarg = AvFILL(av) + 1;
5675 assert(oplist->op_type == OP_PADAV
5676 || oplist->op_type == OP_RV2AV);
5677 oplist = oplist->op_sibling;;
5680 if (SvRMAGICAL(av)) {
5683 Newx(array, maxarg, SV*);
5685 for (i=0; i < maxarg; i++) {
5686 SV ** const svp = av_fetch(av, i, FALSE);
5687 array[i] = svp ? *svp : &PL_sv_undef;
5691 array = AvARRAY(av);
5693 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5694 array, maxarg, NULL, recompile_p,
5696 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5702 /* we make the assumption here that each op in the list of
5703 * op_siblings maps to one SV pushed onto the stack,
5704 * except for code blocks, with have both an OP_NULL and
5706 * This allows us to match up the list of SVs against the
5707 * list of OPs to find the next code block.
5709 * Note that PUSHMARK PADSV PADSV ..
5711 * PADRANGE PADSV PADSV ..
5712 * so the alignment still works. */
5715 if (oplist->op_type == OP_NULL
5716 && (oplist->op_flags & OPf_SPECIAL))
5718 assert(n < pRExC_state->num_code_blocks);
5719 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5720 pRExC_state->code_blocks[n].block = oplist;
5721 pRExC_state->code_blocks[n].src_regex = NULL;
5724 oplist = oplist->op_sibling; /* skip CONST */
5727 oplist = oplist->op_sibling;;
5730 /* apply magic and QR overloading to arg */
5733 if (SvROK(msv) && SvAMAGIC(msv)) {
5734 SV *sv = AMG_CALLunary(msv, regexp_amg);
5738 if (SvTYPE(sv) != SVt_REGEXP)
5739 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5744 /* try concatenation overload ... */
5745 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5746 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5749 /* overloading involved: all bets are off over literal
5750 * code. Pretend we haven't seen it */
5751 pRExC_state->num_code_blocks -= n;
5755 /* ... or failing that, try "" overload */
5756 while (SvAMAGIC(msv)
5757 && (sv = AMG_CALLunary(msv, string_amg))
5761 && SvRV(msv) == SvRV(sv))
5766 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5770 /* this is a partially unrolled
5771 * sv_catsv_nomg(pat, msv);
5772 * that allows us to adjust code block indices if
5775 char *dst = SvPV_force_nomg(pat, dlen);
5777 if (SvUTF8(msv) && !SvUTF8(pat)) {
5778 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5779 sv_setpvn(pat, dst, dlen);
5782 sv_catsv_nomg(pat, msv);
5789 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5792 /* extract any code blocks within any embedded qr//'s */
5793 if (rx && SvTYPE(rx) == SVt_REGEXP
5794 && RX_ENGINE((REGEXP*)rx)->op_comp)
5797 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5798 if (ri->num_code_blocks) {
5800 /* the presence of an embedded qr// with code means
5801 * we should always recompile: the text of the
5802 * qr// may not have changed, but it may be a
5803 * different closure than last time */
5805 Renew(pRExC_state->code_blocks,
5806 pRExC_state->num_code_blocks + ri->num_code_blocks,
5807 struct reg_code_block);
5808 pRExC_state->num_code_blocks += ri->num_code_blocks;
5810 for (i=0; i < ri->num_code_blocks; i++) {
5811 struct reg_code_block *src, *dst;
5812 STRLEN offset = orig_patlen
5813 + ReANY((REGEXP *)rx)->pre_prefix;
5814 assert(n < pRExC_state->num_code_blocks);
5815 src = &ri->code_blocks[i];
5816 dst = &pRExC_state->code_blocks[n];
5817 dst->start = src->start + offset;
5818 dst->end = src->end + offset;
5819 dst->block = src->block;
5820 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5829 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5838 /* see if there are any run-time code blocks in the pattern.
5839 * False positives are allowed */
5842 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5843 char *pat, STRLEN plen)
5848 for (s = 0; s < plen; s++) {
5849 if (n < pRExC_state->num_code_blocks
5850 && s == pRExC_state->code_blocks[n].start)
5852 s = pRExC_state->code_blocks[n].end;
5856 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5858 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5860 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5867 /* Handle run-time code blocks. We will already have compiled any direct
5868 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5869 * copy of it, but with any literal code blocks blanked out and
5870 * appropriate chars escaped; then feed it into
5872 * eval "qr'modified_pattern'"
5876 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5880 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5882 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5883 * and merge them with any code blocks of the original regexp.
5885 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5886 * instead, just save the qr and return FALSE; this tells our caller that
5887 * the original pattern needs upgrading to utf8.
5891 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5892 char *pat, STRLEN plen)
5896 GET_RE_DEBUG_FLAGS_DECL;
5898 if (pRExC_state->runtime_code_qr) {
5899 /* this is the second time we've been called; this should
5900 * only happen if the main pattern got upgraded to utf8
5901 * during compilation; re-use the qr we compiled first time
5902 * round (which should be utf8 too)
5904 qr = pRExC_state->runtime_code_qr;
5905 pRExC_state->runtime_code_qr = NULL;
5906 assert(RExC_utf8 && SvUTF8(qr));
5912 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5916 /* determine how many extra chars we need for ' and \ escaping */
5917 for (s = 0; s < plen; s++) {
5918 if (pat[s] == '\'' || pat[s] == '\\')
5922 Newx(newpat, newlen, char);
5924 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
5926 for (s = 0; s < plen; s++) {
5927 if (n < pRExC_state->num_code_blocks
5928 && s == pRExC_state->code_blocks[n].start)
5930 /* blank out literal code block */
5931 assert(pat[s] == '(');
5932 while (s <= pRExC_state->code_blocks[n].end) {
5940 if (pat[s] == '\'' || pat[s] == '\\')
5945 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
5949 PerlIO_printf(Perl_debug_log,
5950 "%sre-parsing pattern for runtime code:%s %s\n",
5951 PL_colors[4],PL_colors[5],newpat);
5954 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
5960 PUSHSTACKi(PERLSI_REQUIRE);
5961 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
5962 * parsing qr''; normally only q'' does this. It also alters
5964 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
5965 SvREFCNT_dec_NN(sv);
5970 SV * const errsv = ERRSV;
5971 if (SvTRUE_NN(errsv))
5973 Safefree(pRExC_state->code_blocks);
5974 /* use croak_sv ? */
5975 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
5978 assert(SvROK(qr_ref));
5980 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
5981 /* the leaving below frees the tmp qr_ref.
5982 * Give qr a life of its own */
5990 if (!RExC_utf8 && SvUTF8(qr)) {
5991 /* first time through; the pattern got upgraded; save the
5992 * qr for the next time through */
5993 assert(!pRExC_state->runtime_code_qr);
5994 pRExC_state->runtime_code_qr = qr;
5999 /* extract any code blocks within the returned qr// */
6002 /* merge the main (r1) and run-time (r2) code blocks into one */
6004 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6005 struct reg_code_block *new_block, *dst;
6006 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6009 if (!r2->num_code_blocks) /* we guessed wrong */
6011 SvREFCNT_dec_NN(qr);
6016 r1->num_code_blocks + r2->num_code_blocks,
6017 struct reg_code_block);
6020 while ( i1 < r1->num_code_blocks
6021 || i2 < r2->num_code_blocks)
6023 struct reg_code_block *src;
6026 if (i1 == r1->num_code_blocks) {
6027 src = &r2->code_blocks[i2++];
6030 else if (i2 == r2->num_code_blocks)
6031 src = &r1->code_blocks[i1++];
6032 else if ( r1->code_blocks[i1].start
6033 < r2->code_blocks[i2].start)
6035 src = &r1->code_blocks[i1++];
6036 assert(src->end < r2->code_blocks[i2].start);
6039 assert( r1->code_blocks[i1].start
6040 > r2->code_blocks[i2].start);
6041 src = &r2->code_blocks[i2++];
6043 assert(src->end < r1->code_blocks[i1].start);
6046 assert(pat[src->start] == '(');
6047 assert(pat[src->end] == ')');
6048 dst->start = src->start;
6049 dst->end = src->end;
6050 dst->block = src->block;
6051 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6055 r1->num_code_blocks += r2->num_code_blocks;
6056 Safefree(r1->code_blocks);
6057 r1->code_blocks = new_block;
6060 SvREFCNT_dec_NN(qr);
6066 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6067 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6068 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6069 STRLEN longest_length, bool eol, bool meol)
6071 /* This is the common code for setting up the floating and fixed length
6072 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6073 * as to whether succeeded or not */
6078 if (! (longest_length
6079 || (eol /* Can't have SEOL and MULTI */
6080 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6082 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6083 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6088 /* copy the information about the longest from the reg_scan_data
6089 over to the program. */
6090 if (SvUTF8(sv_longest)) {
6091 *rx_utf8 = sv_longest;
6094 *rx_substr = sv_longest;
6097 /* end_shift is how many chars that must be matched that
6098 follow this item. We calculate it ahead of time as once the
6099 lookbehind offset is added in we lose the ability to correctly
6101 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6102 *rx_end_shift = ml - offset
6103 - longest_length + (SvTAIL(sv_longest) != 0)
6106 t = (eol/* Can't have SEOL and MULTI */
6107 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6108 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6114 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6115 * regular expression into internal code.
6116 * The pattern may be passed either as:
6117 * a list of SVs (patternp plus pat_count)
6118 * a list of OPs (expr)
6119 * If both are passed, the SV list is used, but the OP list indicates
6120 * which SVs are actually pre-compiled code blocks
6122 * The SVs in the list have magic and qr overloading applied to them (and
6123 * the list may be modified in-place with replacement SVs in the latter
6126 * If the pattern hasn't changed from old_re, then old_re will be
6129 * eng is the current engine. If that engine has an op_comp method, then
6130 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6131 * do the initial concatenation of arguments and pass on to the external
6134 * If is_bare_re is not null, set it to a boolean indicating whether the
6135 * arg list reduced (after overloading) to a single bare regex which has
6136 * been returned (i.e. /$qr/).
6138 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6140 * pm_flags contains the PMf_* flags, typically based on those from the
6141 * pm_flags field of the related PMOP. Currently we're only interested in
6142 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6144 * We can't allocate space until we know how big the compiled form will be,
6145 * but we can't compile it (and thus know how big it is) until we've got a
6146 * place to put the code. So we cheat: we compile it twice, once with code
6147 * generation turned off and size counting turned on, and once "for real".
6148 * This also means that we don't allocate space until we are sure that the
6149 * thing really will compile successfully, and we never have to move the
6150 * code and thus invalidate pointers into it. (Note that it has to be in
6151 * one piece because free() must be able to free it all.) [NB: not true in perl]
6153 * Beware that the optimization-preparation code in here knows about some
6154 * of the structure of the compiled regexp. [I'll say.]
6158 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6159 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6160 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6165 regexp_internal *ri;
6173 SV *code_blocksv = NULL;
6174 SV** new_patternp = patternp;
6176 /* these are all flags - maybe they should be turned
6177 * into a single int with different bit masks */
6178 I32 sawlookahead = 0;
6183 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6185 bool runtime_code = 0;
6187 RExC_state_t RExC_state;
6188 RExC_state_t * const pRExC_state = &RExC_state;
6189 #ifdef TRIE_STUDY_OPT
6191 RExC_state_t copyRExC_state;
6193 GET_RE_DEBUG_FLAGS_DECL;
6195 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6197 DEBUG_r(if (!PL_colorset) reginitcolors());
6199 #ifndef PERL_IN_XSUB_RE
6200 /* Initialize these here instead of as-needed, as is quick and avoids
6201 * having to test them each time otherwise */
6202 if (! PL_AboveLatin1) {
6203 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6204 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6205 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6206 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6207 PL_HasMultiCharFold =
6208 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6212 pRExC_state->code_blocks = NULL;
6213 pRExC_state->num_code_blocks = 0;
6216 *is_bare_re = FALSE;
6218 if (expr && (expr->op_type == OP_LIST ||
6219 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6220 /* allocate code_blocks if needed */
6224 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6225 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6226 ncode++; /* count of DO blocks */
6228 pRExC_state->num_code_blocks = ncode;
6229 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6234 /* compile-time pattern with just OP_CONSTs and DO blocks */
6239 /* find how many CONSTs there are */
6242 if (expr->op_type == OP_CONST)
6245 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6246 if (o->op_type == OP_CONST)
6250 /* fake up an SV array */
6252 assert(!new_patternp);
6253 Newx(new_patternp, n, SV*);
6254 SAVEFREEPV(new_patternp);
6258 if (expr->op_type == OP_CONST)
6259 new_patternp[n] = cSVOPx_sv(expr);
6261 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6262 if (o->op_type == OP_CONST)
6263 new_patternp[n++] = cSVOPo_sv;
6268 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6269 "Assembling pattern from %d elements%s\n", pat_count,
6270 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6272 /* set expr to the first arg op */
6274 if (pRExC_state->num_code_blocks
6275 && expr->op_type != OP_CONST)
6277 expr = cLISTOPx(expr)->op_first;
6278 assert( expr->op_type == OP_PUSHMARK
6279 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6280 || expr->op_type == OP_PADRANGE);
6281 expr = expr->op_sibling;
6284 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6285 expr, &recompile, NULL);
6287 /* handle bare (possibly after overloading) regex: foo =~ $re */
6292 if (SvTYPE(re) == SVt_REGEXP) {
6296 Safefree(pRExC_state->code_blocks);
6297 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6298 "Precompiled pattern%s\n",
6299 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6305 exp = SvPV_nomg(pat, plen);
6307 if (!eng->op_comp) {
6308 if ((SvUTF8(pat) && IN_BYTES)
6309 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6311 /* make a temporary copy; either to convert to bytes,
6312 * or to avoid repeating get-magic / overloaded stringify */
6313 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6314 (IN_BYTES ? 0 : SvUTF8(pat)));
6316 Safefree(pRExC_state->code_blocks);
6317 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6320 /* ignore the utf8ness if the pattern is 0 length */
6321 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6322 RExC_uni_semantics = 0;
6323 RExC_contains_locale = 0;
6324 RExC_contains_i = 0;
6325 pRExC_state->runtime_code_qr = NULL;
6328 SV *dsv= sv_newmortal();
6329 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6330 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6331 PL_colors[4],PL_colors[5],s);
6335 /* we jump here if we upgrade the pattern to utf8 and have to
6338 if ((pm_flags & PMf_USE_RE_EVAL)
6339 /* this second condition covers the non-regex literal case,
6340 * i.e. $foo =~ '(?{})'. */
6341 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6343 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6345 /* return old regex if pattern hasn't changed */
6346 /* XXX: note in the below we have to check the flags as well as the
6349 * Things get a touch tricky as we have to compare the utf8 flag
6350 * independently from the compile flags. */
6354 && !!RX_UTF8(old_re) == !!RExC_utf8
6355 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6356 && RX_PRECOMP(old_re)
6357 && RX_PRELEN(old_re) == plen
6358 && memEQ(RX_PRECOMP(old_re), exp, plen)
6359 && !runtime_code /* with runtime code, always recompile */ )
6361 Safefree(pRExC_state->code_blocks);
6365 rx_flags = orig_rx_flags;
6367 if (rx_flags & PMf_FOLD) {
6368 RExC_contains_i = 1;
6370 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6372 /* Set to use unicode semantics if the pattern is in utf8 and has the
6373 * 'depends' charset specified, as it means unicode when utf8 */
6374 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6378 RExC_flags = rx_flags;
6379 RExC_pm_flags = pm_flags;
6382 if (TAINTING_get && TAINT_get)
6383 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6385 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6386 /* whoops, we have a non-utf8 pattern, whilst run-time code
6387 * got compiled as utf8. Try again with a utf8 pattern */
6388 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6389 pRExC_state->num_code_blocks);
6390 goto redo_first_pass;
6393 assert(!pRExC_state->runtime_code_qr);
6399 RExC_in_lookbehind = 0;
6400 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6402 RExC_override_recoding = 0;
6403 RExC_in_multi_char_class = 0;
6405 /* First pass: determine size, legality. */
6408 RExC_end = exp + plen;
6413 RExC_emit = (regnode *) &RExC_emit_dummy;
6414 RExC_whilem_seen = 0;
6415 RExC_open_parens = NULL;
6416 RExC_close_parens = NULL;
6418 RExC_paren_names = NULL;
6420 RExC_paren_name_list = NULL;
6422 RExC_recurse = NULL;
6423 RExC_study_chunk_recursed = NULL;
6424 RExC_study_chunk_recursed_bytes= 0;
6425 RExC_recurse_count = 0;
6426 pRExC_state->code_index = 0;
6428 #if 0 /* REGC() is (currently) a NOP at the first pass.
6429 * Clever compilers notice this and complain. --jhi */
6430 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6433 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6435 RExC_lastparse=NULL;
6437 /* reg may croak on us, not giving us a chance to free
6438 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6439 need it to survive as long as the regexp (qr/(?{})/).
6440 We must check that code_blocksv is not already set, because we may
6441 have jumped back to restart the sizing pass. */
6442 if (pRExC_state->code_blocks && !code_blocksv) {
6443 code_blocksv = newSV_type(SVt_PV);
6444 SAVEFREESV(code_blocksv);
6445 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6446 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6448 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6449 /* It's possible to write a regexp in ascii that represents Unicode
6450 codepoints outside of the byte range, such as via \x{100}. If we
6451 detect such a sequence we have to convert the entire pattern to utf8
6452 and then recompile, as our sizing calculation will have been based
6453 on 1 byte == 1 character, but we will need to use utf8 to encode
6454 at least some part of the pattern, and therefore must convert the whole
6457 if (flags & RESTART_UTF8) {
6458 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6459 pRExC_state->num_code_blocks);
6460 goto redo_first_pass;
6462 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6465 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6468 PerlIO_printf(Perl_debug_log,
6469 "Required size %"IVdf" nodes\n"
6470 "Starting second pass (creation)\n",
6473 RExC_lastparse=NULL;
6476 /* The first pass could have found things that force Unicode semantics */
6477 if ((RExC_utf8 || RExC_uni_semantics)
6478 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6480 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6483 /* Small enough for pointer-storage convention?
6484 If extralen==0, this means that we will not need long jumps. */
6485 if (RExC_size >= 0x10000L && RExC_extralen)
6486 RExC_size += RExC_extralen;
6489 if (RExC_whilem_seen > 15)
6490 RExC_whilem_seen = 15;
6492 /* Allocate space and zero-initialize. Note, the two step process
6493 of zeroing when in debug mode, thus anything assigned has to
6494 happen after that */
6495 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6497 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6498 char, regexp_internal);
6499 if ( r == NULL || ri == NULL )
6500 FAIL("Regexp out of space");
6502 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6503 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6506 /* bulk initialize base fields with 0. */
6507 Zero(ri, sizeof(regexp_internal), char);
6510 /* non-zero initialization begins here */
6513 r->extflags = rx_flags;
6514 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6516 if (pm_flags & PMf_IS_QR) {
6517 ri->code_blocks = pRExC_state->code_blocks;
6518 ri->num_code_blocks = pRExC_state->num_code_blocks;
6523 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6524 if (pRExC_state->code_blocks[n].src_regex)
6525 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6526 SAVEFREEPV(pRExC_state->code_blocks);
6530 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6531 bool has_charset = (get_regex_charset(r->extflags)
6532 != REGEX_DEPENDS_CHARSET);
6534 /* The caret is output if there are any defaults: if not all the STD
6535 * flags are set, or if no character set specifier is needed */
6537 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6539 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6540 == REG_RUN_ON_COMMENT_SEEN);
6541 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6542 >> RXf_PMf_STD_PMMOD_SHIFT);
6543 const char *fptr = STD_PAT_MODS; /*"msix"*/
6545 /* Allocate for the worst case, which is all the std flags are turned
6546 * on. If more precision is desired, we could do a population count of
6547 * the flags set. This could be done with a small lookup table, or by
6548 * shifting, masking and adding, or even, when available, assembly
6549 * language for a machine-language population count.
6550 * We never output a minus, as all those are defaults, so are
6551 * covered by the caret */
6552 const STRLEN wraplen = plen + has_p + has_runon
6553 + has_default /* If needs a caret */
6555 /* If needs a character set specifier */
6556 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6557 + (sizeof(STD_PAT_MODS) - 1)
6558 + (sizeof("(?:)") - 1);
6560 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6561 r->xpv_len_u.xpvlenu_pv = p;
6563 SvFLAGS(rx) |= SVf_UTF8;
6566 /* If a default, cover it using the caret */
6568 *p++= DEFAULT_PAT_MOD;
6572 const char* const name = get_regex_charset_name(r->extflags, &len);
6573 Copy(name, p, len, char);
6577 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6580 while((ch = *fptr++)) {
6588 Copy(RExC_precomp, p, plen, char);
6589 assert ((RX_WRAPPED(rx) - p) < 16);
6590 r->pre_prefix = p - RX_WRAPPED(rx);
6596 SvCUR_set(rx, p - RX_WRAPPED(rx));
6600 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6602 /* setup various meta data about recursion, this all requires
6603 * RExC_npar to be correctly set, and a bit later on we clear it */
6604 if (RExC_seen & REG_RECURSE_SEEN) {
6605 Newxz(RExC_open_parens, RExC_npar,regnode *);
6606 SAVEFREEPV(RExC_open_parens);
6607 Newxz(RExC_close_parens,RExC_npar,regnode *);
6608 SAVEFREEPV(RExC_close_parens);
6610 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6611 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6612 * So its 1 if there are no parens. */
6613 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6614 ((RExC_npar & 0x07) != 0);
6615 Newx(RExC_study_chunk_recursed,
6616 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6617 SAVEFREEPV(RExC_study_chunk_recursed);
6620 /* Useful during FAIL. */
6621 #ifdef RE_TRACK_PATTERN_OFFSETS
6622 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6623 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6624 "%s %"UVuf" bytes for offset annotations.\n",
6625 ri->u.offsets ? "Got" : "Couldn't get",
6626 (UV)((2*RExC_size+1) * sizeof(U32))));
6628 SetProgLen(ri,RExC_size);
6632 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6634 /* Second pass: emit code. */
6635 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6636 RExC_pm_flags = pm_flags;
6638 RExC_end = exp + plen;
6641 RExC_emit_start = ri->program;
6642 RExC_emit = ri->program;
6643 RExC_emit_bound = ri->program + RExC_size + 1;
6644 pRExC_state->code_index = 0;
6646 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6647 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6649 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6651 /* XXXX To minimize changes to RE engine we always allocate
6652 3-units-long substrs field. */
6653 Newx(r->substrs, 1, struct reg_substr_data);
6654 if (RExC_recurse_count) {
6655 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6656 SAVEFREEPV(RExC_recurse);
6660 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6661 Zero(r->substrs, 1, struct reg_substr_data);
6662 if (RExC_study_chunk_recursed)
6663 Zero(RExC_study_chunk_recursed,
6664 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6666 #ifdef TRIE_STUDY_OPT
6668 StructCopy(&zero_scan_data, &data, scan_data_t);
6669 copyRExC_state = RExC_state;
6672 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6674 RExC_state = copyRExC_state;
6675 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6676 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6678 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6679 StructCopy(&zero_scan_data, &data, scan_data_t);
6682 StructCopy(&zero_scan_data, &data, scan_data_t);
6685 /* Dig out information for optimizations. */
6686 r->extflags = RExC_flags; /* was pm_op */
6687 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6690 SvUTF8_on(rx); /* Unicode in it? */
6691 ri->regstclass = NULL;
6692 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6693 r->intflags |= PREGf_NAUGHTY;
6694 scan = ri->program + 1; /* First BRANCH. */
6696 /* testing for BRANCH here tells us whether there is "must appear"
6697 data in the pattern. If there is then we can use it for optimisations */
6698 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
6701 STRLEN longest_float_length, longest_fixed_length;
6702 regnode_ssc ch_class; /* pointed to by data */
6704 SSize_t last_close = 0; /* pointed to by data */
6705 regnode *first= scan;
6706 regnode *first_next= regnext(first);
6708 * Skip introductions and multiplicators >= 1
6709 * so that we can extract the 'meat' of the pattern that must
6710 * match in the large if() sequence following.
6711 * NOTE that EXACT is NOT covered here, as it is normally
6712 * picked up by the optimiser separately.
6714 * This is unfortunate as the optimiser isnt handling lookahead
6715 * properly currently.
6718 while ((OP(first) == OPEN && (sawopen = 1)) ||
6719 /* An OR of *one* alternative - should not happen now. */
6720 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6721 /* for now we can't handle lookbehind IFMATCH*/
6722 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6723 (OP(first) == PLUS) ||
6724 (OP(first) == MINMOD) ||
6725 /* An {n,m} with n>0 */
6726 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6727 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6730 * the only op that could be a regnode is PLUS, all the rest
6731 * will be regnode_1 or regnode_2.
6733 * (yves doesn't think this is true)
6735 if (OP(first) == PLUS)
6738 if (OP(first) == MINMOD)
6740 first += regarglen[OP(first)];
6742 first = NEXTOPER(first);
6743 first_next= regnext(first);
6746 /* Starting-point info. */
6748 DEBUG_PEEP("first:",first,0);
6749 /* Ignore EXACT as we deal with it later. */
6750 if (PL_regkind[OP(first)] == EXACT) {
6751 if (OP(first) == EXACT)
6752 NOOP; /* Empty, get anchored substr later. */
6754 ri->regstclass = first;
6757 else if (PL_regkind[OP(first)] == TRIE &&
6758 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6761 /* this can happen only on restudy */
6762 if ( OP(first) == TRIE ) {
6763 struct regnode_1 *trieop = (struct regnode_1 *)
6764 PerlMemShared_calloc(1, sizeof(struct regnode_1));
6765 StructCopy(first,trieop,struct regnode_1);
6766 trie_op=(regnode *)trieop;
6768 struct regnode_charclass *trieop = (struct regnode_charclass *)
6769 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
6770 StructCopy(first,trieop,struct regnode_charclass);
6771 trie_op=(regnode *)trieop;
6774 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
6775 ri->regstclass = trie_op;
6778 else if (REGNODE_SIMPLE(OP(first)))
6779 ri->regstclass = first;
6780 else if (PL_regkind[OP(first)] == BOUND ||
6781 PL_regkind[OP(first)] == NBOUND)
6782 ri->regstclass = first;
6783 else if (PL_regkind[OP(first)] == BOL) {
6784 r->intflags |= (OP(first) == MBOL
6786 : (OP(first) == SBOL
6789 first = NEXTOPER(first);
6792 else if (OP(first) == GPOS) {
6793 r->intflags |= PREGf_ANCH_GPOS;
6794 first = NEXTOPER(first);
6797 else if ((!sawopen || !RExC_sawback) &&
6798 (OP(first) == STAR &&
6799 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6800 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6802 /* turn .* into ^.* with an implied $*=1 */
6804 (OP(NEXTOPER(first)) == REG_ANY)
6807 r->intflags |= (type | PREGf_IMPLICIT);
6808 first = NEXTOPER(first);
6811 if (sawplus && !sawminmod && !sawlookahead
6812 && (!sawopen || !RExC_sawback)
6813 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6814 /* x+ must match at the 1st pos of run of x's */
6815 r->intflags |= PREGf_SKIP;
6817 /* Scan is after the zeroth branch, first is atomic matcher. */
6818 #ifdef TRIE_STUDY_OPT
6821 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6822 (IV)(first - scan + 1))
6826 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6827 (IV)(first - scan + 1))
6833 * If there's something expensive in the r.e., find the
6834 * longest literal string that must appear and make it the
6835 * regmust. Resolve ties in favor of later strings, since
6836 * the regstart check works with the beginning of the r.e.
6837 * and avoiding duplication strengthens checking. Not a
6838 * strong reason, but sufficient in the absence of others.
6839 * [Now we resolve ties in favor of the earlier string if
6840 * it happens that c_offset_min has been invalidated, since the
6841 * earlier string may buy us something the later one won't.]
6844 data.longest_fixed = newSVpvs("");
6845 data.longest_float = newSVpvs("");
6846 data.last_found = newSVpvs("");
6847 data.longest = &(data.longest_fixed);
6848 ENTER_with_name("study_chunk");
6849 SAVEFREESV(data.longest_fixed);
6850 SAVEFREESV(data.longest_float);
6851 SAVEFREESV(data.last_found);
6853 if (!ri->regstclass) {
6854 ssc_init(pRExC_state, &ch_class);
6855 data.start_class = &ch_class;
6856 stclass_flag = SCF_DO_STCLASS_AND;
6857 } else /* XXXX Check for BOUND? */
6859 data.last_closep = &last_close;
6862 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6863 scan + RExC_size, /* Up to end */
6865 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6866 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6870 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6873 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6874 && data.last_start_min == 0 && data.last_end > 0
6875 && !RExC_seen_zerolen
6876 && !(RExC_seen & REG_VERBARG_SEEN)
6877 && !(RExC_seen & REG_GPOS_SEEN)
6879 r->extflags |= RXf_CHECK_ALL;
6881 scan_commit(pRExC_state, &data,&minlen,0);
6883 longest_float_length = CHR_SVLEN(data.longest_float);
6885 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6886 && data.offset_fixed == data.offset_float_min
6887 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6888 && S_setup_longest (aTHX_ pRExC_state,
6892 &(r->float_end_shift),
6893 data.lookbehind_float,
6894 data.offset_float_min,
6896 longest_float_length,
6897 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6898 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6900 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6901 r->float_max_offset = data.offset_float_max;
6902 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6903 r->float_max_offset -= data.lookbehind_float;
6904 SvREFCNT_inc_simple_void_NN(data.longest_float);
6907 r->float_substr = r->float_utf8 = NULL;
6908 longest_float_length = 0;
6911 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6913 if (S_setup_longest (aTHX_ pRExC_state,
6915 &(r->anchored_utf8),
6916 &(r->anchored_substr),
6917 &(r->anchored_end_shift),
6918 data.lookbehind_fixed,
6921 longest_fixed_length,
6922 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6923 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6925 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6926 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6929 r->anchored_substr = r->anchored_utf8 = NULL;
6930 longest_fixed_length = 0;
6932 LEAVE_with_name("study_chunk");
6935 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
6936 ri->regstclass = NULL;
6938 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
6940 && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
6941 && !ssc_is_anything(data.start_class))
6943 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
6945 ssc_finalize(pRExC_state, data.start_class);
6947 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
6948 StructCopy(data.start_class,
6949 (regnode_ssc*)RExC_rxi->data->data[n],
6951 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
6952 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
6953 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
6954 regprop(r, sv, (regnode*)data.start_class, NULL);
6955 PerlIO_printf(Perl_debug_log,
6956 "synthetic stclass \"%s\".\n",
6957 SvPVX_const(sv));});
6958 data.start_class = NULL;
6961 /* A temporary algorithm prefers floated substr to fixed one to dig
6963 if (longest_fixed_length > longest_float_length) {
6964 r->substrs->check_ix = 0;
6965 r->check_end_shift = r->anchored_end_shift;
6966 r->check_substr = r->anchored_substr;
6967 r->check_utf8 = r->anchored_utf8;
6968 r->check_offset_min = r->check_offset_max = r->anchored_offset;
6969 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
6970 r->intflags |= PREGf_NOSCAN;
6973 r->substrs->check_ix = 1;
6974 r->check_end_shift = r->float_end_shift;
6975 r->check_substr = r->float_substr;
6976 r->check_utf8 = r->float_utf8;
6977 r->check_offset_min = r->float_min_offset;
6978 r->check_offset_max = r->float_max_offset;
6980 if ((r->check_substr || r->check_utf8) ) {
6981 r->extflags |= RXf_USE_INTUIT;
6982 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
6983 r->extflags |= RXf_INTUIT_TAIL;
6985 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
6987 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
6988 if ( (STRLEN)minlen < longest_float_length )
6989 minlen= longest_float_length;
6990 if ( (STRLEN)minlen < longest_fixed_length )
6991 minlen= longest_fixed_length;
6995 /* Several toplevels. Best we can is to set minlen. */
6997 regnode_ssc ch_class;
6998 SSize_t last_close = 0;
7000 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7002 scan = ri->program + 1;
7003 ssc_init(pRExC_state, &ch_class);
7004 data.start_class = &ch_class;
7005 data.last_closep = &last_close;
7008 minlen = study_chunk(pRExC_state,
7009 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7010 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7011 ? SCF_TRIE_DOING_RESTUDY
7015 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7017 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7018 = r->float_substr = r->float_utf8 = NULL;
7020 if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7021 && ! ssc_is_anything(data.start_class))
7023 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7025 ssc_finalize(pRExC_state, data.start_class);
7027 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7028 StructCopy(data.start_class,
7029 (regnode_ssc*)RExC_rxi->data->data[n],
7031 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7032 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7033 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7034 regprop(r, sv, (regnode*)data.start_class, NULL);
7035 PerlIO_printf(Perl_debug_log,
7036 "synthetic stclass \"%s\".\n",
7037 SvPVX_const(sv));});
7038 data.start_class = NULL;
7042 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7043 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7044 r->maxlen = REG_INFTY;
7047 r->maxlen = RExC_maxlen;
7050 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7051 the "real" pattern. */
7053 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7054 (IV)minlen, (IV)r->minlen, RExC_maxlen);
7056 r->minlenret = minlen;
7057 if (r->minlen < minlen)
7060 if (RExC_seen & REG_GPOS_SEEN)
7061 r->intflags |= PREGf_GPOS_SEEN;
7062 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7063 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7065 if (pRExC_state->num_code_blocks)
7066 r->extflags |= RXf_EVAL_SEEN;
7067 if (RExC_seen & REG_CANY_SEEN)
7068 r->intflags |= PREGf_CANY_SEEN;
7069 if (RExC_seen & REG_VERBARG_SEEN)
7071 r->intflags |= PREGf_VERBARG_SEEN;
7072 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7074 if (RExC_seen & REG_CUTGROUP_SEEN)
7075 r->intflags |= PREGf_CUTGROUP_SEEN;
7076 if (pm_flags & PMf_USE_RE_EVAL)
7077 r->intflags |= PREGf_USE_RE_EVAL;
7078 if (RExC_paren_names)
7079 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7081 RXp_PAREN_NAMES(r) = NULL;
7083 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7084 * so it can be used in pp.c */
7085 if (r->intflags & PREGf_ANCH)
7086 r->extflags |= RXf_IS_ANCHORED;
7090 /* this is used to identify "special" patterns that might result
7091 * in Perl NOT calling the regex engine and instead doing the match "itself",
7092 * particularly special cases in split//. By having the regex compiler
7093 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7094 * we avoid weird issues with equivalent patterns resulting in different behavior,
7095 * AND we allow non Perl engines to get the same optimizations by the setting the
7096 * flags appropriately - Yves */
7097 regnode *first = ri->program + 1;
7099 regnode *next = NEXTOPER(first);
7102 if (PL_regkind[fop] == NOTHING && nop == END)
7103 r->extflags |= RXf_NULL;
7104 else if (PL_regkind[fop] == BOL && nop == END)
7105 r->extflags |= RXf_START_ONLY;
7106 else if (fop == PLUS
7107 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7108 && OP(regnext(first)) == END)
7109 r->extflags |= RXf_WHITE;
7110 else if ( r->extflags & RXf_SPLIT
7112 && STR_LEN(first) == 1
7113 && *(STRING(first)) == ' '
7114 && OP(regnext(first)) == END )
7115 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7119 if (RExC_contains_locale) {
7120 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7124 if (RExC_paren_names) {
7125 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7126 ri->data->data[ri->name_list_idx]
7127 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7130 ri->name_list_idx = 0;
7132 if (RExC_recurse_count) {
7133 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7134 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7135 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7138 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7139 /* assume we don't need to swap parens around before we match */
7143 PerlIO_printf(Perl_debug_log,"Final program:\n");
7146 #ifdef RE_TRACK_PATTERN_OFFSETS
7147 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7148 const STRLEN len = ri->u.offsets[0];
7150 GET_RE_DEBUG_FLAGS_DECL;
7151 PerlIO_printf(Perl_debug_log,
7152 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7153 for (i = 1; i <= len; i++) {
7154 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7155 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7156 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7158 PerlIO_printf(Perl_debug_log, "\n");
7163 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7164 * by setting the regexp SV to readonly-only instead. If the
7165 * pattern's been recompiled, the USEDness should remain. */
7166 if (old_re && SvREADONLY(old_re))
7174 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7177 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7179 PERL_UNUSED_ARG(value);
7181 if (flags & RXapif_FETCH) {
7182 return reg_named_buff_fetch(rx, key, flags);
7183 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7184 Perl_croak_no_modify();
7186 } else if (flags & RXapif_EXISTS) {
7187 return reg_named_buff_exists(rx, key, flags)
7190 } else if (flags & RXapif_REGNAMES) {
7191 return reg_named_buff_all(rx, flags);
7192 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7193 return reg_named_buff_scalar(rx, flags);
7195 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7201 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7204 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7205 PERL_UNUSED_ARG(lastkey);
7207 if (flags & RXapif_FIRSTKEY)
7208 return reg_named_buff_firstkey(rx, flags);
7209 else if (flags & RXapif_NEXTKEY)
7210 return reg_named_buff_nextkey(rx, flags);
7212 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7219 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7222 AV *retarray = NULL;
7224 struct regexp *const rx = ReANY(r);
7226 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7228 if (flags & RXapif_ALL)
7231 if (rx && RXp_PAREN_NAMES(rx)) {
7232 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7235 SV* sv_dat=HeVAL(he_str);
7236 I32 *nums=(I32*)SvPVX(sv_dat);
7237 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7238 if ((I32)(rx->nparens) >= nums[i]
7239 && rx->offs[nums[i]].start != -1
7240 && rx->offs[nums[i]].end != -1)
7243 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7248 ret = newSVsv(&PL_sv_undef);
7251 av_push(retarray, ret);
7254 return newRV_noinc(MUTABLE_SV(retarray));
7261 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7264 struct regexp *const rx = ReANY(r);
7266 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7268 if (rx && RXp_PAREN_NAMES(rx)) {
7269 if (flags & RXapif_ALL) {
7270 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7272 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7274 SvREFCNT_dec_NN(sv);
7286 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7288 struct regexp *const rx = ReANY(r);
7290 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7292 if ( rx && RXp_PAREN_NAMES(rx) ) {
7293 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7295 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7302 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7304 struct regexp *const rx = ReANY(r);
7305 GET_RE_DEBUG_FLAGS_DECL;
7307 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7309 if (rx && RXp_PAREN_NAMES(rx)) {
7310 HV *hv = RXp_PAREN_NAMES(rx);
7312 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7315 SV* sv_dat = HeVAL(temphe);
7316 I32 *nums = (I32*)SvPVX(sv_dat);
7317 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7318 if ((I32)(rx->lastparen) >= nums[i] &&
7319 rx->offs[nums[i]].start != -1 &&
7320 rx->offs[nums[i]].end != -1)
7326 if (parno || flags & RXapif_ALL) {
7327 return newSVhek(HeKEY_hek(temphe));
7335 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7340 struct regexp *const rx = ReANY(r);
7342 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7344 if (rx && RXp_PAREN_NAMES(rx)) {
7345 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7346 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7347 } else if (flags & RXapif_ONE) {
7348 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7349 av = MUTABLE_AV(SvRV(ret));
7350 length = av_tindex(av);
7351 SvREFCNT_dec_NN(ret);
7352 return newSViv(length + 1);
7354 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7359 return &PL_sv_undef;
7363 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7365 struct regexp *const rx = ReANY(r);
7368 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7370 if (rx && RXp_PAREN_NAMES(rx)) {
7371 HV *hv= RXp_PAREN_NAMES(rx);
7373 (void)hv_iterinit(hv);
7374 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7377 SV* sv_dat = HeVAL(temphe);
7378 I32 *nums = (I32*)SvPVX(sv_dat);
7379 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7380 if ((I32)(rx->lastparen) >= nums[i] &&
7381 rx->offs[nums[i]].start != -1 &&
7382 rx->offs[nums[i]].end != -1)
7388 if (parno || flags & RXapif_ALL) {
7389 av_push(av, newSVhek(HeKEY_hek(temphe)));
7394 return newRV_noinc(MUTABLE_SV(av));
7398 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7401 struct regexp *const rx = ReANY(r);
7407 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7409 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7410 || n == RX_BUFF_IDX_CARET_FULLMATCH
7411 || n == RX_BUFF_IDX_CARET_POSTMATCH
7414 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7416 /* on something like
7419 * the KEEPCOPY is set on the PMOP rather than the regex */
7420 if (PL_curpm && r == PM_GETRE(PL_curpm))
7421 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7430 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7431 /* no need to distinguish between them any more */
7432 n = RX_BUFF_IDX_FULLMATCH;
7434 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7435 && rx->offs[0].start != -1)
7437 /* $`, ${^PREMATCH} */
7438 i = rx->offs[0].start;
7442 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7443 && rx->offs[0].end != -1)
7445 /* $', ${^POSTMATCH} */
7446 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7447 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7450 if ( 0 <= n && n <= (I32)rx->nparens &&
7451 (s1 = rx->offs[n].start) != -1 &&
7452 (t1 = rx->offs[n].end) != -1)
7454 /* $&, ${^MATCH}, $1 ... */
7456 s = rx->subbeg + s1 - rx->suboffset;
7461 assert(s >= rx->subbeg);
7462 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7464 #ifdef NO_TAINT_SUPPORT
7465 sv_setpvn(sv, s, i);
7467 const int oldtainted = TAINT_get;
7469 sv_setpvn(sv, s, i);
7470 TAINT_set(oldtainted);
7472 if ( (rx->intflags & PREGf_CANY_SEEN)
7473 ? (RXp_MATCH_UTF8(rx)
7474 && (!i || is_utf8_string((U8*)s, i)))
7475 : (RXp_MATCH_UTF8(rx)) )
7482 if (RXp_MATCH_TAINTED(rx)) {
7483 if (SvTYPE(sv) >= SVt_PVMG) {
7484 MAGIC* const mg = SvMAGIC(sv);
7487 SvMAGIC_set(sv, mg->mg_moremagic);
7489 if ((mgt = SvMAGIC(sv))) {
7490 mg->mg_moremagic = mgt;
7491 SvMAGIC_set(sv, mg);
7502 sv_setsv(sv,&PL_sv_undef);
7508 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7509 SV const * const value)
7511 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7513 PERL_UNUSED_ARG(rx);
7514 PERL_UNUSED_ARG(paren);
7515 PERL_UNUSED_ARG(value);
7518 Perl_croak_no_modify();
7522 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7525 struct regexp *const rx = ReANY(r);
7529 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7531 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7532 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7533 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7536 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7538 /* on something like
7541 * the KEEPCOPY is set on the PMOP rather than the regex */
7542 if (PL_curpm && r == PM_GETRE(PL_curpm))
7543 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7549 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7551 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7552 case RX_BUFF_IDX_PREMATCH: /* $` */
7553 if (rx->offs[0].start != -1) {
7554 i = rx->offs[0].start;
7563 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7564 case RX_BUFF_IDX_POSTMATCH: /* $' */
7565 if (rx->offs[0].end != -1) {
7566 i = rx->sublen - rx->offs[0].end;
7568 s1 = rx->offs[0].end;
7575 default: /* $& / ${^MATCH}, $1, $2, ... */
7576 if (paren <= (I32)rx->nparens &&
7577 (s1 = rx->offs[paren].start) != -1 &&
7578 (t1 = rx->offs[paren].end) != -1)
7584 if (ckWARN(WARN_UNINITIALIZED))
7585 report_uninit((const SV *)sv);
7590 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7591 const char * const s = rx->subbeg - rx->suboffset + s1;
7596 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7603 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7605 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7606 PERL_UNUSED_ARG(rx);
7610 return newSVpvs("Regexp");
7613 /* Scans the name of a named buffer from the pattern.
7614 * If flags is REG_RSN_RETURN_NULL returns null.
7615 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7616 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7617 * to the parsed name as looked up in the RExC_paren_names hash.
7618 * If there is an error throws a vFAIL().. type exception.
7621 #define REG_RSN_RETURN_NULL 0
7622 #define REG_RSN_RETURN_NAME 1
7623 #define REG_RSN_RETURN_DATA 2
7626 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7628 char *name_start = RExC_parse;
7630 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7632 assert (RExC_parse <= RExC_end);
7633 if (RExC_parse == RExC_end) NOOP;
7634 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7635 /* skip IDFIRST by using do...while */
7638 RExC_parse += UTF8SKIP(RExC_parse);
7639 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7643 } while (isWORDCHAR(*RExC_parse));
7645 RExC_parse++; /* so the <- from the vFAIL is after the offending
7647 vFAIL("Group name must start with a non-digit word character");
7651 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7652 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7653 if ( flags == REG_RSN_RETURN_NAME)
7655 else if (flags==REG_RSN_RETURN_DATA) {
7658 if ( ! sv_name ) /* should not happen*/
7659 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7660 if (RExC_paren_names)
7661 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7663 sv_dat = HeVAL(he_str);
7665 vFAIL("Reference to nonexistent named group");
7669 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7670 (unsigned long) flags);
7672 assert(0); /* NOT REACHED */
7677 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7678 int rem=(int)(RExC_end - RExC_parse); \
7687 if (RExC_lastparse!=RExC_parse) \
7688 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7691 iscut ? "..." : "<" \
7694 PerlIO_printf(Perl_debug_log,"%16s",""); \
7697 num = RExC_size + 1; \
7699 num=REG_NODE_NUM(RExC_emit); \
7700 if (RExC_lastnum!=num) \
7701 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7703 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7704 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7705 (int)((depth*2)), "", \
7709 RExC_lastparse=RExC_parse; \
7714 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7715 DEBUG_PARSE_MSG((funcname)); \
7716 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7718 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7719 DEBUG_PARSE_MSG((funcname)); \
7720 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7723 /* This section of code defines the inversion list object and its methods. The
7724 * interfaces are highly subject to change, so as much as possible is static to
7725 * this file. An inversion list is here implemented as a malloc'd C UV array
7726 * as an SVt_INVLIST scalar.
7728 * An inversion list for Unicode is an array of code points, sorted by ordinal
7729 * number. The zeroth element is the first code point in the list. The 1th
7730 * element is the first element beyond that not in the list. In other words,
7731 * the first range is
7732 * invlist[0]..(invlist[1]-1)
7733 * The other ranges follow. Thus every element whose index is divisible by two
7734 * marks the beginning of a range that is in the list, and every element not
7735 * divisible by two marks the beginning of a range not in the list. A single
7736 * element inversion list that contains the single code point N generally
7737 * consists of two elements
7740 * (The exception is when N is the highest representable value on the
7741 * machine, in which case the list containing just it would be a single
7742 * element, itself. By extension, if the last range in the list extends to
7743 * infinity, then the first element of that range will be in the inversion list
7744 * at a position that is divisible by two, and is the final element in the
7746 * Taking the complement (inverting) an inversion list is quite simple, if the
7747 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7748 * This implementation reserves an element at the beginning of each inversion
7749 * list to always contain 0; there is an additional flag in the header which
7750 * indicates if the list begins at the 0, or is offset to begin at the next
7753 * More about inversion lists can be found in "Unicode Demystified"
7754 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7755 * More will be coming when functionality is added later.
7757 * The inversion list data structure is currently implemented as an SV pointing
7758 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7759 * array of UV whose memory management is automatically handled by the existing
7760 * facilities for SV's.
7762 * Some of the methods should always be private to the implementation, and some
7763 * should eventually be made public */
7765 /* The header definitions are in F<inline_invlist.c> */
7767 PERL_STATIC_INLINE UV*
7768 S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
7770 /* Returns a pointer to the first element in the inversion list's array.
7771 * This is called upon initialization of an inversion list. Where the
7772 * array begins depends on whether the list has the code point U+0000 in it
7773 * or not. The other parameter tells it whether the code that follows this
7774 * call is about to put a 0 in the inversion list or not. The first
7775 * element is either the element reserved for 0, if TRUE, or the element
7776 * after it, if FALSE */
7778 bool* offset = get_invlist_offset_addr(invlist);
7779 UV* zero_addr = (UV *) SvPVX(invlist);
7781 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7784 assert(! _invlist_len(invlist));
7788 /* 1^1 = 0; 1^0 = 1 */
7789 *offset = 1 ^ will_have_0;
7790 return zero_addr + *offset;
7793 PERL_STATIC_INLINE UV*
7794 S_invlist_array(pTHX_ SV* const invlist)
7796 /* Returns the pointer to the inversion list's array. Every time the
7797 * length changes, this needs to be called in case malloc or realloc moved
7800 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7802 /* Must not be empty. If these fail, you probably didn't check for <len>
7803 * being non-zero before trying to get the array */
7804 assert(_invlist_len(invlist));
7806 /* The very first element always contains zero, The array begins either
7807 * there, or if the inversion list is offset, at the element after it.
7808 * The offset header field determines which; it contains 0 or 1 to indicate
7809 * how much additionally to add */
7810 assert(0 == *(SvPVX(invlist)));
7811 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7814 PERL_STATIC_INLINE void
7815 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7817 /* Sets the current number of elements stored in the inversion list.
7818 * Updates SvCUR correspondingly */
7820 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7822 assert(SvTYPE(invlist) == SVt_INVLIST);
7827 : TO_INTERNAL_SIZE(len + offset));
7828 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7831 PERL_STATIC_INLINE IV*
7832 S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
7834 /* Return the address of the IV that is reserved to hold the cached index
7837 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7839 assert(SvTYPE(invlist) == SVt_INVLIST);
7841 return &(((XINVLIST*) SvANY(invlist))->prev_index);
7844 PERL_STATIC_INLINE IV
7845 S_invlist_previous_index(pTHX_ SV* const invlist)
7847 /* Returns cached index of previous search */
7849 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7851 return *get_invlist_previous_index_addr(invlist);
7854 PERL_STATIC_INLINE void
7855 S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
7857 /* Caches <index> for later retrieval */
7859 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7861 assert(index == 0 || index < (int) _invlist_len(invlist));
7863 *get_invlist_previous_index_addr(invlist) = index;
7866 PERL_STATIC_INLINE UV
7867 S_invlist_max(pTHX_ SV* const invlist)
7869 /* Returns the maximum number of elements storable in the inversion list's
7870 * array, without having to realloc() */
7872 PERL_ARGS_ASSERT_INVLIST_MAX;
7874 assert(SvTYPE(invlist) == SVt_INVLIST);
7876 /* Assumes worst case, in which the 0 element is not counted in the
7877 * inversion list, so subtracts 1 for that */
7878 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7879 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7880 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7883 #ifndef PERL_IN_XSUB_RE
7885 Perl__new_invlist(pTHX_ IV initial_size)
7888 /* Return a pointer to a newly constructed inversion list, with enough
7889 * space to store 'initial_size' elements. If that number is negative, a
7890 * system default is used instead */
7894 if (initial_size < 0) {
7898 /* Allocate the initial space */
7899 new_list = newSV_type(SVt_INVLIST);
7901 /* First 1 is in case the zero element isn't in the list; second 1 is for
7903 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7904 invlist_set_len(new_list, 0, 0);
7906 /* Force iterinit() to be used to get iteration to work */
7907 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7909 *get_invlist_previous_index_addr(new_list) = 0;
7915 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7917 /* Return a pointer to a newly constructed inversion list, initialized to
7918 * point to <list>, which has to be in the exact correct inversion list
7919 * form, including internal fields. Thus this is a dangerous routine that
7920 * should not be used in the wrong hands. The passed in 'list' contains
7921 * several header fields at the beginning that are not part of the
7922 * inversion list body proper */
7924 const STRLEN length = (STRLEN) list[0];
7925 const UV version_id = list[1];
7926 const bool offset = cBOOL(list[2]);
7927 #define HEADER_LENGTH 3
7928 /* If any of the above changes in any way, you must change HEADER_LENGTH
7929 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7930 * perl -E 'say int(rand 2**31-1)'
7932 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
7933 data structure type, so that one being
7934 passed in can be validated to be an
7935 inversion list of the correct vintage.
7938 SV* invlist = newSV_type(SVt_INVLIST);
7940 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
7942 if (version_id != INVLIST_VERSION_ID) {
7943 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
7946 /* The generated array passed in includes header elements that aren't part
7947 * of the list proper, so start it just after them */
7948 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
7950 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
7951 shouldn't touch it */
7953 *(get_invlist_offset_addr(invlist)) = offset;
7955 /* The 'length' passed to us is the physical number of elements in the
7956 * inversion list. But if there is an offset the logical number is one
7958 invlist_set_len(invlist, length - offset, offset);
7960 invlist_set_previous_index(invlist, 0);
7962 /* Initialize the iteration pointer. */
7963 invlist_iterfinish(invlist);
7965 SvREADONLY_on(invlist);
7969 #endif /* ifndef PERL_IN_XSUB_RE */
7972 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
7974 /* Grow the maximum size of an inversion list */
7976 PERL_ARGS_ASSERT_INVLIST_EXTEND;
7978 assert(SvTYPE(invlist) == SVt_INVLIST);
7980 /* Add one to account for the zero element at the beginning which may not
7981 * be counted by the calling parameters */
7982 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
7985 PERL_STATIC_INLINE void
7986 S_invlist_trim(pTHX_ SV* const invlist)
7988 PERL_ARGS_ASSERT_INVLIST_TRIM;
7990 assert(SvTYPE(invlist) == SVt_INVLIST);
7992 /* Change the length of the inversion list to how many entries it currently
7994 SvPV_shrink_to_cur((SV *) invlist);
7998 S__append_range_to_invlist(pTHX_ SV* const invlist,
7999 const UV start, const UV end)
8001 /* Subject to change or removal. Append the range from 'start' to 'end' at
8002 * the end of the inversion list. The range must be above any existing
8006 UV max = invlist_max(invlist);
8007 UV len = _invlist_len(invlist);
8010 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8012 if (len == 0) { /* Empty lists must be initialized */
8013 offset = start != 0;
8014 array = _invlist_array_init(invlist, ! offset);
8017 /* Here, the existing list is non-empty. The current max entry in the
8018 * list is generally the first value not in the set, except when the
8019 * set extends to the end of permissible values, in which case it is
8020 * the first entry in that final set, and so this call is an attempt to
8021 * append out-of-order */
8023 UV final_element = len - 1;
8024 array = invlist_array(invlist);
8025 if (array[final_element] > start
8026 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8028 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8029 array[final_element], start,
8030 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8033 /* Here, it is a legal append. If the new range begins with the first
8034 * value not in the set, it is extending the set, so the new first
8035 * value not in the set is one greater than the newly extended range.
8037 offset = *get_invlist_offset_addr(invlist);
8038 if (array[final_element] == start) {
8039 if (end != UV_MAX) {
8040 array[final_element] = end + 1;
8043 /* But if the end is the maximum representable on the machine,
8044 * just let the range that this would extend to have no end */
8045 invlist_set_len(invlist, len - 1, offset);
8051 /* Here the new range doesn't extend any existing set. Add it */
8053 len += 2; /* Includes an element each for the start and end of range */
8055 /* If wll overflow the existing space, extend, which may cause the array to
8058 invlist_extend(invlist, len);
8060 /* Have to set len here to avoid assert failure in invlist_array() */
8061 invlist_set_len(invlist, len, offset);
8063 array = invlist_array(invlist);
8066 invlist_set_len(invlist, len, offset);
8069 /* The next item on the list starts the range, the one after that is
8070 * one past the new range. */
8071 array[len - 2] = start;
8072 if (end != UV_MAX) {
8073 array[len - 1] = end + 1;
8076 /* But if the end is the maximum representable on the machine, just let
8077 * the range have no end */
8078 invlist_set_len(invlist, len - 1, offset);
8082 #ifndef PERL_IN_XSUB_RE
8085 Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
8087 /* Searches the inversion list for the entry that contains the input code
8088 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8089 * return value is the index into the list's array of the range that
8094 IV high = _invlist_len(invlist);
8095 const IV highest_element = high - 1;
8098 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8100 /* If list is empty, return failure. */
8105 /* (We can't get the array unless we know the list is non-empty) */
8106 array = invlist_array(invlist);
8108 mid = invlist_previous_index(invlist);
8109 assert(mid >=0 && mid <= highest_element);
8111 /* <mid> contains the cache of the result of the previous call to this
8112 * function (0 the first time). See if this call is for the same result,
8113 * or if it is for mid-1. This is under the theory that calls to this
8114 * function will often be for related code points that are near each other.
8115 * And benchmarks show that caching gives better results. We also test
8116 * here if the code point is within the bounds of the list. These tests
8117 * replace others that would have had to be made anyway to make sure that
8118 * the array bounds were not exceeded, and these give us extra information
8119 * at the same time */
8120 if (cp >= array[mid]) {
8121 if (cp >= array[highest_element]) {
8122 return highest_element;
8125 /* Here, array[mid] <= cp < array[highest_element]. This means that
8126 * the final element is not the answer, so can exclude it; it also
8127 * means that <mid> is not the final element, so can refer to 'mid + 1'
8129 if (cp < array[mid + 1]) {
8135 else { /* cp < aray[mid] */
8136 if (cp < array[0]) { /* Fail if outside the array */
8140 if (cp >= array[mid - 1]) {
8145 /* Binary search. What we are looking for is <i> such that
8146 * array[i] <= cp < array[i+1]
8147 * The loop below converges on the i+1. Note that there may not be an
8148 * (i+1)th element in the array, and things work nonetheless */
8149 while (low < high) {
8150 mid = (low + high) / 2;
8151 assert(mid <= highest_element);
8152 if (array[mid] <= cp) { /* cp >= array[mid] */
8155 /* We could do this extra test to exit the loop early.
8156 if (cp < array[low]) {
8161 else { /* cp < array[mid] */
8168 invlist_set_previous_index(invlist, high);
8173 Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
8174 const UV start, const UV end, U8* swatch)
8176 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8177 * but is used when the swash has an inversion list. This makes this much
8178 * faster, as it uses a binary search instead of a linear one. This is
8179 * intimately tied to that function, and perhaps should be in utf8.c,
8180 * except it is intimately tied to inversion lists as well. It assumes
8181 * that <swatch> is all 0's on input */
8184 const IV len = _invlist_len(invlist);
8188 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8190 if (len == 0) { /* Empty inversion list */
8194 array = invlist_array(invlist);
8196 /* Find which element it is */
8197 i = _invlist_search(invlist, start);
8199 /* We populate from <start> to <end> */
8200 while (current < end) {
8203 /* The inversion list gives the results for every possible code point
8204 * after the first one in the list. Only those ranges whose index is
8205 * even are ones that the inversion list matches. For the odd ones,
8206 * and if the initial code point is not in the list, we have to skip
8207 * forward to the next element */
8208 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8210 if (i >= len) { /* Finished if beyond the end of the array */
8214 if (current >= end) { /* Finished if beyond the end of what we
8216 if (LIKELY(end < UV_MAX)) {
8220 /* We get here when the upper bound is the maximum
8221 * representable on the machine, and we are looking for just
8222 * that code point. Have to special case it */
8224 goto join_end_of_list;
8227 assert(current >= start);
8229 /* The current range ends one below the next one, except don't go past
8232 upper = (i < len && array[i] < end) ? array[i] : end;
8234 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8235 * for each code point in it */
8236 for (; current < upper; current++) {
8237 const STRLEN offset = (STRLEN)(current - start);
8238 swatch[offset >> 3] |= 1 << (offset & 7);
8243 /* Quit if at the end of the list */
8246 /* But first, have to deal with the highest possible code point on
8247 * the platform. The previous code assumes that <end> is one
8248 * beyond where we want to populate, but that is impossible at the
8249 * platform's infinity, so have to handle it specially */
8250 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8252 const STRLEN offset = (STRLEN)(end - start);
8253 swatch[offset >> 3] |= 1 << (offset & 7);
8258 /* Advance to the next range, which will be for code points not in the
8267 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8268 const bool complement_b, SV** output)
8270 /* Take the union of two inversion lists and point <output> to it. *output
8271 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8272 * the reference count to that list will be decremented if not already a
8273 * temporary (mortal); otherwise *output will be made correspondingly
8274 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8275 * second list is returned. If <complement_b> is TRUE, the union is taken
8276 * of the complement (inversion) of <b> instead of b itself.
8278 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8279 * Richard Gillam, published by Addison-Wesley, and explained at some
8280 * length there. The preface says to incorporate its examples into your
8281 * code at your own risk.
8283 * The algorithm is like a merge sort.
8285 * XXX A potential performance improvement is to keep track as we go along
8286 * if only one of the inputs contributes to the result, meaning the other
8287 * is a subset of that one. In that case, we can skip the final copy and
8288 * return the larger of the input lists, but then outside code might need
8289 * to keep track of whether to free the input list or not */
8291 const UV* array_a; /* a's array */
8293 UV len_a; /* length of a's array */
8296 SV* u; /* the resulting union */
8300 UV i_a = 0; /* current index into a's array */
8304 /* running count, as explained in the algorithm source book; items are
8305 * stopped accumulating and are output when the count changes to/from 0.
8306 * The count is incremented when we start a range that's in the set, and
8307 * decremented when we start a range that's not in the set. So its range
8308 * is 0 to 2. Only when the count is zero is something not in the set.
8312 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8315 /* If either one is empty, the union is the other one */
8316 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8317 bool make_temp = FALSE; /* Should we mortalize the result? */
8321 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8327 *output = invlist_clone(b);
8329 _invlist_invert(*output);
8331 } /* else *output already = b; */
8334 sv_2mortal(*output);
8338 else if ((len_b = _invlist_len(b)) == 0) {
8339 bool make_temp = FALSE;
8341 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8346 /* The complement of an empty list is a list that has everything in it,
8347 * so the union with <a> includes everything too */
8350 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8354 *output = _new_invlist(1);
8355 _append_range_to_invlist(*output, 0, UV_MAX);
8357 else if (*output != a) {
8358 *output = invlist_clone(a);
8360 /* else *output already = a; */
8363 sv_2mortal(*output);
8368 /* Here both lists exist and are non-empty */
8369 array_a = invlist_array(a);
8370 array_b = invlist_array(b);
8372 /* If are to take the union of 'a' with the complement of b, set it
8373 * up so are looking at b's complement. */
8376 /* To complement, we invert: if the first element is 0, remove it. To
8377 * do this, we just pretend the array starts one later */
8378 if (array_b[0] == 0) {
8384 /* But if the first element is not zero, we pretend the list starts
8385 * at the 0 that is always stored immediately before the array. */
8391 /* Size the union for the worst case: that the sets are completely
8393 u = _new_invlist(len_a + len_b);
8395 /* Will contain U+0000 if either component does */
8396 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8397 || (len_b > 0 && array_b[0] == 0));
8399 /* Go through each list item by item, stopping when exhausted one of
8401 while (i_a < len_a && i_b < len_b) {
8402 UV cp; /* The element to potentially add to the union's array */
8403 bool cp_in_set; /* is it in the the input list's set or not */
8405 /* We need to take one or the other of the two inputs for the union.
8406 * Since we are merging two sorted lists, we take the smaller of the
8407 * next items. In case of a tie, we take the one that is in its set
8408 * first. If we took one not in the set first, it would decrement the
8409 * count, possibly to 0 which would cause it to be output as ending the
8410 * range, and the next time through we would take the same number, and
8411 * output it again as beginning the next range. By doing it the
8412 * opposite way, there is no possibility that the count will be
8413 * momentarily decremented to 0, and thus the two adjoining ranges will
8414 * be seamlessly merged. (In a tie and both are in the set or both not
8415 * in the set, it doesn't matter which we take first.) */
8416 if (array_a[i_a] < array_b[i_b]
8417 || (array_a[i_a] == array_b[i_b]
8418 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8420 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8424 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8425 cp = array_b[i_b++];
8428 /* Here, have chosen which of the two inputs to look at. Only output
8429 * if the running count changes to/from 0, which marks the
8430 * beginning/end of a range in that's in the set */
8433 array_u[i_u++] = cp;
8440 array_u[i_u++] = cp;
8445 /* Here, we are finished going through at least one of the lists, which
8446 * means there is something remaining in at most one. We check if the list
8447 * that hasn't been exhausted is positioned such that we are in the middle
8448 * of a range in its set or not. (i_a and i_b point to the element beyond
8449 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8450 * is potentially more to output.
8451 * There are four cases:
8452 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8453 * in the union is entirely from the non-exhausted set.
8454 * 2) Both were in their sets, count is 2. Nothing further should
8455 * be output, as everything that remains will be in the exhausted
8456 * list's set, hence in the union; decrementing to 1 but not 0 insures
8458 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8459 * Nothing further should be output because the union includes
8460 * everything from the exhausted set. Not decrementing ensures that.
8461 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8462 * decrementing to 0 insures that we look at the remainder of the
8463 * non-exhausted set */
8464 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8465 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8470 /* The final length is what we've output so far, plus what else is about to
8471 * be output. (If 'count' is non-zero, then the input list we exhausted
8472 * has everything remaining up to the machine's limit in its set, and hence
8473 * in the union, so there will be no further output. */
8476 /* At most one of the subexpressions will be non-zero */
8477 len_u += (len_a - i_a) + (len_b - i_b);
8480 /* Set result to final length, which can change the pointer to array_u, so
8482 if (len_u != _invlist_len(u)) {
8483 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8485 array_u = invlist_array(u);
8488 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8489 * the other) ended with everything above it not in its set. That means
8490 * that the remaining part of the union is precisely the same as the
8491 * non-exhausted list, so can just copy it unchanged. (If both list were
8492 * exhausted at the same time, then the operations below will be both 0.)
8495 IV copy_count; /* At most one will have a non-zero copy count */
8496 if ((copy_count = len_a - i_a) > 0) {
8497 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8499 else if ((copy_count = len_b - i_b) > 0) {
8500 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8504 /* We may be removing a reference to one of the inputs. If so, the output
8505 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8506 * count decremented) */
8507 if (a == *output || b == *output) {
8508 assert(! invlist_is_iterating(*output));
8509 if ((SvTEMP(*output))) {
8513 SvREFCNT_dec_NN(*output);
8523 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8524 const bool complement_b, SV** i)
8526 /* Take the intersection of two inversion lists and point <i> to it. *i
8527 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8528 * the reference count to that list will be decremented if not already a
8529 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8530 * The first list, <a>, may be NULL, in which case an empty list is
8531 * returned. If <complement_b> is TRUE, the result will be the
8532 * intersection of <a> and the complement (or inversion) of <b> instead of
8535 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8536 * Richard Gillam, published by Addison-Wesley, and explained at some
8537 * length there. The preface says to incorporate its examples into your
8538 * code at your own risk. In fact, it had bugs
8540 * The algorithm is like a merge sort, and is essentially the same as the
8544 const UV* array_a; /* a's array */
8546 UV len_a; /* length of a's array */
8549 SV* r; /* the resulting intersection */
8553 UV i_a = 0; /* current index into a's array */
8557 /* running count, as explained in the algorithm source book; items are
8558 * stopped accumulating and are output when the count changes to/from 2.
8559 * The count is incremented when we start a range that's in the set, and
8560 * decremented when we start a range that's not in the set. So its range
8561 * is 0 to 2. Only when the count is 2 is something in the intersection.
8565 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8568 /* Special case if either one is empty */
8569 len_a = (a == NULL) ? 0 : _invlist_len(a);
8570 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8571 bool make_temp = FALSE;
8573 if (len_a != 0 && complement_b) {
8575 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8576 * be empty. Here, also we are using 'b's complement, which hence
8577 * must be every possible code point. Thus the intersection is
8581 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8586 *i = invlist_clone(a);
8588 /* else *i is already 'a' */
8596 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8597 * intersection must be empty */
8599 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8604 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8608 *i = _new_invlist(0);
8616 /* Here both lists exist and are non-empty */
8617 array_a = invlist_array(a);
8618 array_b = invlist_array(b);
8620 /* If are to take the intersection of 'a' with the complement of b, set it
8621 * up so are looking at b's complement. */
8624 /* To complement, we invert: if the first element is 0, remove it. To
8625 * do this, we just pretend the array starts one later */
8626 if (array_b[0] == 0) {
8632 /* But if the first element is not zero, we pretend the list starts
8633 * at the 0 that is always stored immediately before the array. */
8639 /* Size the intersection for the worst case: that the intersection ends up
8640 * fragmenting everything to be completely disjoint */
8641 r= _new_invlist(len_a + len_b);
8643 /* Will contain U+0000 iff both components do */
8644 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8645 && len_b > 0 && array_b[0] == 0);
8647 /* Go through each list item by item, stopping when exhausted one of
8649 while (i_a < len_a && i_b < len_b) {
8650 UV cp; /* The element to potentially add to the intersection's
8652 bool cp_in_set; /* Is it in the input list's set or not */
8654 /* We need to take one or the other of the two inputs for the
8655 * intersection. Since we are merging two sorted lists, we take the
8656 * smaller of the next items. In case of a tie, we take the one that
8657 * is not in its set first (a difference from the union algorithm). If
8658 * we took one in the set first, it would increment the count, possibly
8659 * to 2 which would cause it to be output as starting a range in the
8660 * intersection, and the next time through we would take that same
8661 * number, and output it again as ending the set. By doing it the
8662 * opposite of this, there is no possibility that the count will be
8663 * momentarily incremented to 2. (In a tie and both are in the set or
8664 * both not in the set, it doesn't matter which we take first.) */
8665 if (array_a[i_a] < array_b[i_b]
8666 || (array_a[i_a] == array_b[i_b]
8667 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8669 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8673 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8677 /* Here, have chosen which of the two inputs to look at. Only output
8678 * if the running count changes to/from 2, which marks the
8679 * beginning/end of a range that's in the intersection */
8683 array_r[i_r++] = cp;
8688 array_r[i_r++] = cp;
8694 /* Here, we are finished going through at least one of the lists, which
8695 * means there is something remaining in at most one. We check if the list
8696 * that has been exhausted is positioned such that we are in the middle
8697 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8698 * the ones we care about.) There are four cases:
8699 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8700 * nothing left in the intersection.
8701 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8702 * above 2. What should be output is exactly that which is in the
8703 * non-exhausted set, as everything it has is also in the intersection
8704 * set, and everything it doesn't have can't be in the intersection
8705 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8706 * gets incremented to 2. Like the previous case, the intersection is
8707 * everything that remains in the non-exhausted set.
8708 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8709 * remains 1. And the intersection has nothing more. */
8710 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8711 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8716 /* The final length is what we've output so far plus what else is in the
8717 * intersection. At most one of the subexpressions below will be non-zero
8721 len_r += (len_a - i_a) + (len_b - i_b);
8724 /* Set result to final length, which can change the pointer to array_r, so
8726 if (len_r != _invlist_len(r)) {
8727 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8729 array_r = invlist_array(r);
8732 /* Finish outputting any remaining */
8733 if (count >= 2) { /* At most one will have a non-zero copy count */
8735 if ((copy_count = len_a - i_a) > 0) {
8736 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8738 else if ((copy_count = len_b - i_b) > 0) {
8739 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8743 /* We may be removing a reference to one of the inputs. If so, the output
8744 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8745 * count decremented) */
8746 if (a == *i || b == *i) {
8747 assert(! invlist_is_iterating(*i));
8752 SvREFCNT_dec_NN(*i);
8762 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8764 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8765 * set. A pointer to the inversion list is returned. This may actually be
8766 * a new list, in which case the passed in one has been destroyed. The
8767 * passed in inversion list can be NULL, in which case a new one is created
8768 * with just the one range in it */
8773 if (invlist == NULL) {
8774 invlist = _new_invlist(2);
8778 len = _invlist_len(invlist);
8781 /* If comes after the final entry actually in the list, can just append it
8784 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8785 && start >= invlist_array(invlist)[len - 1]))
8787 _append_range_to_invlist(invlist, start, end);
8791 /* Here, can't just append things, create and return a new inversion list
8792 * which is the union of this range and the existing inversion list */
8793 range_invlist = _new_invlist(2);
8794 _append_range_to_invlist(range_invlist, start, end);
8796 _invlist_union(invlist, range_invlist, &invlist);
8798 /* The temporary can be freed */
8799 SvREFCNT_dec_NN(range_invlist);
8805 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8806 UV** other_elements_ptr)
8808 /* Create and return an inversion list whose contents are to be populated
8809 * by the caller. The caller gives the number of elements (in 'size') and
8810 * the very first element ('element0'). This function will set
8811 * '*other_elements_ptr' to an array of UVs, where the remaining elements
8814 * Obviously there is some trust involved that the caller will properly
8815 * fill in the other elements of the array.
8817 * (The first element needs to be passed in, as the underlying code does
8818 * things differently depending on whether it is zero or non-zero) */
8820 SV* invlist = _new_invlist(size);
8823 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8825 _append_range_to_invlist(invlist, element0, element0);
8826 offset = *get_invlist_offset_addr(invlist);
8828 invlist_set_len(invlist, size, offset);
8829 *other_elements_ptr = invlist_array(invlist) + 1;
8835 PERL_STATIC_INLINE SV*
8836 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8837 return _add_range_to_invlist(invlist, cp, cp);
8840 #ifndef PERL_IN_XSUB_RE
8842 Perl__invlist_invert(pTHX_ SV* const invlist)
8844 /* Complement the input inversion list. This adds a 0 if the list didn't
8845 * have a zero; removes it otherwise. As described above, the data
8846 * structure is set up so that this is very efficient */
8848 PERL_ARGS_ASSERT__INVLIST_INVERT;
8850 assert(! invlist_is_iterating(invlist));
8852 /* The inverse of matching nothing is matching everything */
8853 if (_invlist_len(invlist) == 0) {
8854 _append_range_to_invlist(invlist, 0, UV_MAX);
8858 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8863 PERL_STATIC_INLINE SV*
8864 S_invlist_clone(pTHX_ SV* const invlist)
8867 /* Return a new inversion list that is a copy of the input one, which is
8868 * unchanged. The new list will not be mortal even if the old one was. */
8870 /* Need to allocate extra space to accommodate Perl's addition of a
8871 * trailing NUL to SvPV's, since it thinks they are always strings */
8872 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8873 STRLEN physical_length = SvCUR(invlist);
8874 bool offset = *(get_invlist_offset_addr(invlist));
8876 PERL_ARGS_ASSERT_INVLIST_CLONE;
8878 *(get_invlist_offset_addr(new_invlist)) = offset;
8879 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8880 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8885 PERL_STATIC_INLINE STRLEN*
8886 S_get_invlist_iter_addr(pTHX_ SV* invlist)
8888 /* Return the address of the UV that contains the current iteration
8891 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8893 assert(SvTYPE(invlist) == SVt_INVLIST);
8895 return &(((XINVLIST*) SvANY(invlist))->iterator);
8898 PERL_STATIC_INLINE void
8899 S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
8901 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8903 *get_invlist_iter_addr(invlist) = 0;
8906 PERL_STATIC_INLINE void
8907 S_invlist_iterfinish(pTHX_ SV* invlist)
8909 /* Terminate iterator for invlist. This is to catch development errors.
8910 * Any iteration that is interrupted before completed should call this
8911 * function. Functions that add code points anywhere else but to the end
8912 * of an inversion list assert that they are not in the middle of an
8913 * iteration. If they were, the addition would make the iteration
8914 * problematical: if the iteration hadn't reached the place where things
8915 * were being added, it would be ok */
8917 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8919 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8923 S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
8925 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8926 * This call sets in <*start> and <*end>, the next range in <invlist>.
8927 * Returns <TRUE> if successful and the next call will return the next
8928 * range; <FALSE> if was already at the end of the list. If the latter,
8929 * <*start> and <*end> are unchanged, and the next call to this function
8930 * will start over at the beginning of the list */
8932 STRLEN* pos = get_invlist_iter_addr(invlist);
8933 UV len = _invlist_len(invlist);
8936 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
8939 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
8943 array = invlist_array(invlist);
8945 *start = array[(*pos)++];
8951 *end = array[(*pos)++] - 1;
8957 PERL_STATIC_INLINE bool
8958 S_invlist_is_iterating(pTHX_ SV* const invlist)
8960 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8962 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8965 PERL_STATIC_INLINE UV
8966 S_invlist_highest(pTHX_ SV* const invlist)
8968 /* Returns the highest code point that matches an inversion list. This API
8969 * has an ambiguity, as it returns 0 under either the highest is actually
8970 * 0, or if the list is empty. If this distinction matters to you, check
8971 * for emptiness before calling this function */
8973 UV len = _invlist_len(invlist);
8976 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
8982 array = invlist_array(invlist);
8984 /* The last element in the array in the inversion list always starts a
8985 * range that goes to infinity. That range may be for code points that are
8986 * matched in the inversion list, or it may be for ones that aren't
8987 * matched. In the latter case, the highest code point in the set is one
8988 * less than the beginning of this range; otherwise it is the final element
8989 * of this range: infinity */
8990 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
8992 : array[len - 1] - 1;
8995 #ifndef PERL_IN_XSUB_RE
8997 Perl__invlist_contents(pTHX_ SV* const invlist)
8999 /* Get the contents of an inversion list into a string SV so that they can
9000 * be printed out. It uses the format traditionally done for debug tracing
9004 SV* output = newSVpvs("\n");
9006 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9008 assert(! invlist_is_iterating(invlist));
9010 invlist_iterinit(invlist);
9011 while (invlist_iternext(invlist, &start, &end)) {
9012 if (end == UV_MAX) {
9013 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9015 else if (end != start) {
9016 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9020 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9028 #ifndef PERL_IN_XSUB_RE
9030 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9031 const char * const indent, SV* const invlist)
9033 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9034 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9035 * the string 'indent'. The output looks like this:
9036 [0] 0x000A .. 0x000D
9038 [4] 0x2028 .. 0x2029
9039 [6] 0x3104 .. INFINITY
9040 * This means that the first range of code points matched by the list are
9041 * 0xA through 0xD; the second range contains only the single code point
9042 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9043 * are used to define each range (except if the final range extends to
9044 * infinity, only a single element is needed). The array index of the
9045 * first element for the corresponding range is given in brackets. */
9050 PERL_ARGS_ASSERT__INVLIST_DUMP;
9052 if (invlist_is_iterating(invlist)) {
9053 Perl_dump_indent(aTHX_ level, file,
9054 "%sCan't dump inversion list because is in middle of iterating\n",
9059 invlist_iterinit(invlist);
9060 while (invlist_iternext(invlist, &start, &end)) {
9061 if (end == UV_MAX) {
9062 Perl_dump_indent(aTHX_ level, file,
9063 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9064 indent, (UV)count, start);
9066 else if (end != start) {
9067 Perl_dump_indent(aTHX_ level, file,
9068 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9069 indent, (UV)count, start, end);
9072 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9073 indent, (UV)count, start);
9080 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9082 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9084 /* Return a boolean as to if the two passed in inversion lists are
9085 * identical. The final argument, if TRUE, says to take the complement of
9086 * the second inversion list before doing the comparison */
9088 const UV* array_a = invlist_array(a);
9089 const UV* array_b = invlist_array(b);
9090 UV len_a = _invlist_len(a);
9091 UV len_b = _invlist_len(b);
9093 UV i = 0; /* current index into the arrays */
9094 bool retval = TRUE; /* Assume are identical until proven otherwise */
9096 PERL_ARGS_ASSERT__INVLISTEQ;
9098 /* If are to compare 'a' with the complement of b, set it
9099 * up so are looking at b's complement. */
9102 /* The complement of nothing is everything, so <a> would have to have
9103 * just one element, starting at zero (ending at infinity) */
9105 return (len_a == 1 && array_a[0] == 0);
9107 else if (array_b[0] == 0) {
9109 /* Otherwise, to complement, we invert. Here, the first element is
9110 * 0, just remove it. To do this, we just pretend the array starts
9118 /* But if the first element is not zero, we pretend the list starts
9119 * at the 0 that is always stored immediately before the array. */
9125 /* Make sure that the lengths are the same, as well as the final element
9126 * before looping through the remainder. (Thus we test the length, final,
9127 * and first elements right off the bat) */
9128 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9131 else for (i = 0; i < len_a - 1; i++) {
9132 if (array_a[i] != array_b[i]) {
9142 #undef HEADER_LENGTH
9143 #undef TO_INTERNAL_SIZE
9144 #undef FROM_INTERNAL_SIZE
9145 #undef INVLIST_VERSION_ID
9147 /* End of inversion list object */
9150 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9152 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9153 * constructs, and updates RExC_flags with them. On input, RExC_parse
9154 * should point to the first flag; it is updated on output to point to the
9155 * final ')' or ':'. There needs to be at least one flag, or this will
9158 /* for (?g), (?gc), and (?o) warnings; warning
9159 about (?c) will warn about (?g) -- japhy */
9161 #define WASTED_O 0x01
9162 #define WASTED_G 0x02
9163 #define WASTED_C 0x04
9164 #define WASTED_GC (WASTED_G|WASTED_C)
9165 I32 wastedflags = 0x00;
9166 U32 posflags = 0, negflags = 0;
9167 U32 *flagsp = &posflags;
9168 char has_charset_modifier = '\0';
9170 bool has_use_defaults = FALSE;
9171 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9173 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9175 /* '^' as an initial flag sets certain defaults */
9176 if (UCHARAT(RExC_parse) == '^') {
9178 has_use_defaults = TRUE;
9179 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9180 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9181 ? REGEX_UNICODE_CHARSET
9182 : REGEX_DEPENDS_CHARSET);
9185 cs = get_regex_charset(RExC_flags);
9186 if (cs == REGEX_DEPENDS_CHARSET
9187 && (RExC_utf8 || RExC_uni_semantics))
9189 cs = REGEX_UNICODE_CHARSET;
9192 while (*RExC_parse) {
9193 /* && strchr("iogcmsx", *RExC_parse) */
9194 /* (?g), (?gc) and (?o) are useless here
9195 and must be globally applied -- japhy */
9196 switch (*RExC_parse) {
9198 /* Code for the imsx flags */
9199 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9201 case LOCALE_PAT_MOD:
9202 if (has_charset_modifier) {
9203 goto excess_modifier;
9205 else if (flagsp == &negflags) {
9208 cs = REGEX_LOCALE_CHARSET;
9209 has_charset_modifier = LOCALE_PAT_MOD;
9211 case UNICODE_PAT_MOD:
9212 if (has_charset_modifier) {
9213 goto excess_modifier;
9215 else if (flagsp == &negflags) {
9218 cs = REGEX_UNICODE_CHARSET;
9219 has_charset_modifier = UNICODE_PAT_MOD;
9221 case ASCII_RESTRICT_PAT_MOD:
9222 if (flagsp == &negflags) {
9225 if (has_charset_modifier) {
9226 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9227 goto excess_modifier;
9229 /* Doubled modifier implies more restricted */
9230 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9233 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9235 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9237 case DEPENDS_PAT_MOD:
9238 if (has_use_defaults) {
9239 goto fail_modifiers;
9241 else if (flagsp == &negflags) {
9244 else if (has_charset_modifier) {
9245 goto excess_modifier;
9248 /* The dual charset means unicode semantics if the
9249 * pattern (or target, not known until runtime) are
9250 * utf8, or something in the pattern indicates unicode
9252 cs = (RExC_utf8 || RExC_uni_semantics)
9253 ? REGEX_UNICODE_CHARSET
9254 : REGEX_DEPENDS_CHARSET;
9255 has_charset_modifier = DEPENDS_PAT_MOD;
9259 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9260 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9262 else if (has_charset_modifier == *(RExC_parse - 1)) {
9263 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9267 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9272 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9275 case ONCE_PAT_MOD: /* 'o' */
9276 case GLOBAL_PAT_MOD: /* 'g' */
9277 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9278 const I32 wflagbit = *RExC_parse == 'o'
9281 if (! (wastedflags & wflagbit) ) {
9282 wastedflags |= wflagbit;
9283 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9286 "Useless (%s%c) - %suse /%c modifier",
9287 flagsp == &negflags ? "?-" : "?",
9289 flagsp == &negflags ? "don't " : "",
9296 case CONTINUE_PAT_MOD: /* 'c' */
9297 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9298 if (! (wastedflags & WASTED_C) ) {
9299 wastedflags |= WASTED_GC;
9300 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9303 "Useless (%sc) - %suse /gc modifier",
9304 flagsp == &negflags ? "?-" : "?",
9305 flagsp == &negflags ? "don't " : ""
9310 case KEEPCOPY_PAT_MOD: /* 'p' */
9311 if (flagsp == &negflags) {
9313 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9315 *flagsp |= RXf_PMf_KEEPCOPY;
9319 /* A flag is a default iff it is following a minus, so
9320 * if there is a minus, it means will be trying to
9321 * re-specify a default which is an error */
9322 if (has_use_defaults || flagsp == &negflags) {
9323 goto fail_modifiers;
9326 wastedflags = 0; /* reset so (?g-c) warns twice */
9330 RExC_flags |= posflags;
9331 RExC_flags &= ~negflags;
9332 set_regex_charset(&RExC_flags, cs);
9333 if (RExC_flags & RXf_PMf_FOLD) {
9334 RExC_contains_i = 1;
9340 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9341 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9342 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9343 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9352 - reg - regular expression, i.e. main body or parenthesized thing
9354 * Caller must absorb opening parenthesis.
9356 * Combining parenthesis handling with the base level of regular expression
9357 * is a trifle forced, but the need to tie the tails of the branches to what
9358 * follows makes it hard to avoid.
9360 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9362 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9364 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9367 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9368 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9369 needs to be restarted.
9370 Otherwise would only return NULL if regbranch() returns NULL, which
9373 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9374 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9375 * 2 is like 1, but indicates that nextchar() has been called to advance
9376 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9377 * this flag alerts us to the need to check for that */
9380 regnode *ret; /* Will be the head of the group. */
9383 regnode *ender = NULL;
9386 U32 oregflags = RExC_flags;
9387 bool have_branch = 0;
9389 I32 freeze_paren = 0;
9390 I32 after_freeze = 0;
9392 char * parse_start = RExC_parse; /* MJD */
9393 char * const oregcomp_parse = RExC_parse;
9395 GET_RE_DEBUG_FLAGS_DECL;
9397 PERL_ARGS_ASSERT_REG;
9398 DEBUG_PARSE("reg ");
9400 *flagp = 0; /* Tentatively. */
9403 /* Make an OPEN node, if parenthesized. */
9406 /* Under /x, space and comments can be gobbled up between the '(' and
9407 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9408 * intervening space, as the sequence is a token, and a token should be
9410 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9412 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9413 char *start_verb = RExC_parse;
9414 STRLEN verb_len = 0;
9415 char *start_arg = NULL;
9416 unsigned char op = 0;
9418 int internal_argval = 0; /* internal_argval is only useful if
9421 if (has_intervening_patws && SIZE_ONLY) {
9422 ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
9424 while ( *RExC_parse && *RExC_parse != ')' ) {
9425 if ( *RExC_parse == ':' ) {
9426 start_arg = RExC_parse + 1;
9432 verb_len = RExC_parse - start_verb;
9435 while ( *RExC_parse && *RExC_parse != ')' )
9437 if ( *RExC_parse != ')' )
9438 vFAIL("Unterminated verb pattern argument");
9439 if ( RExC_parse == start_arg )
9442 if ( *RExC_parse != ')' )
9443 vFAIL("Unterminated verb pattern");
9446 switch ( *start_verb ) {
9447 case 'A': /* (*ACCEPT) */
9448 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9450 internal_argval = RExC_nestroot;
9453 case 'C': /* (*COMMIT) */
9454 if ( memEQs(start_verb,verb_len,"COMMIT") )
9457 case 'F': /* (*FAIL) */
9458 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9463 case ':': /* (*:NAME) */
9464 case 'M': /* (*MARK:NAME) */
9465 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9470 case 'P': /* (*PRUNE) */
9471 if ( memEQs(start_verb,verb_len,"PRUNE") )
9474 case 'S': /* (*SKIP) */
9475 if ( memEQs(start_verb,verb_len,"SKIP") )
9478 case 'T': /* (*THEN) */
9479 /* [19:06] <TimToady> :: is then */
9480 if ( memEQs(start_verb,verb_len,"THEN") ) {
9482 RExC_seen |= REG_CUTGROUP_SEEN;
9487 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9489 "Unknown verb pattern '%"UTF8f"'",
9490 UTF8fARG(UTF, verb_len, start_verb));
9493 if ( start_arg && internal_argval ) {
9494 vFAIL3("Verb pattern '%.*s' may not have an argument",
9495 verb_len, start_verb);
9496 } else if ( argok < 0 && !start_arg ) {
9497 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9498 verb_len, start_verb);
9500 ret = reganode(pRExC_state, op, internal_argval);
9501 if ( ! internal_argval && ! SIZE_ONLY ) {
9503 SV *sv = newSVpvn( start_arg,
9504 RExC_parse - start_arg);
9505 ARG(ret) = add_data( pRExC_state,
9507 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9514 if (!internal_argval)
9515 RExC_seen |= REG_VERBARG_SEEN;
9516 } else if ( start_arg ) {
9517 vFAIL3("Verb pattern '%.*s' may not have an argument",
9518 verb_len, start_verb);
9520 ret = reg_node(pRExC_state, op);
9522 nextchar(pRExC_state);
9525 else if (*RExC_parse == '?') { /* (?...) */
9526 bool is_logical = 0;
9527 const char * const seqstart = RExC_parse;
9528 if (has_intervening_patws && SIZE_ONLY) {
9529 ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
9533 paren = *RExC_parse++;
9534 ret = NULL; /* For look-ahead/behind. */
9537 case 'P': /* (?P...) variants for those used to PCRE/Python */
9538 paren = *RExC_parse++;
9539 if ( paren == '<') /* (?P<...>) named capture */
9541 else if (paren == '>') { /* (?P>name) named recursion */
9542 goto named_recursion;
9544 else if (paren == '=') { /* (?P=...) named backref */
9545 /* this pretty much dupes the code for \k<NAME> in
9546 * regatom(), if you change this make sure you change that
9548 char* name_start = RExC_parse;
9550 SV *sv_dat = reg_scan_name(pRExC_state,
9551 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9552 if (RExC_parse == name_start || *RExC_parse != ')')
9553 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9554 vFAIL2("Sequence %.3s... not terminated",parse_start);
9557 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9558 RExC_rxi->data->data[num]=(void*)sv_dat;
9559 SvREFCNT_inc_simple_void(sv_dat);
9562 ret = reganode(pRExC_state,
9565 : (ASCII_FOLD_RESTRICTED)
9567 : (AT_LEAST_UNI_SEMANTICS)
9575 Set_Node_Offset(ret, parse_start+1);
9576 Set_Node_Cur_Length(ret, parse_start);
9578 nextchar(pRExC_state);
9582 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9583 vFAIL3("Sequence (%.*s...) not recognized",
9584 RExC_parse-seqstart, seqstart);
9586 case '<': /* (?<...) */
9587 if (*RExC_parse == '!')
9589 else if (*RExC_parse != '=')
9595 case '\'': /* (?'...') */
9596 name_start= RExC_parse;
9597 svname = reg_scan_name(pRExC_state,
9598 SIZE_ONLY /* reverse test from the others */
9599 ? REG_RSN_RETURN_NAME
9600 : REG_RSN_RETURN_NULL);
9601 if (RExC_parse == name_start || *RExC_parse != paren)
9602 vFAIL2("Sequence (?%c... not terminated",
9603 paren=='>' ? '<' : paren);
9607 if (!svname) /* shouldn't happen */
9609 "panic: reg_scan_name returned NULL");
9610 if (!RExC_paren_names) {
9611 RExC_paren_names= newHV();
9612 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9614 RExC_paren_name_list= newAV();
9615 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9618 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9620 sv_dat = HeVAL(he_str);
9622 /* croak baby croak */
9624 "panic: paren_name hash element allocation failed");
9625 } else if ( SvPOK(sv_dat) ) {
9626 /* (?|...) can mean we have dupes so scan to check
9627 its already been stored. Maybe a flag indicating
9628 we are inside such a construct would be useful,
9629 but the arrays are likely to be quite small, so
9630 for now we punt -- dmq */
9631 IV count = SvIV(sv_dat);
9632 I32 *pv = (I32*)SvPVX(sv_dat);
9634 for ( i = 0 ; i < count ; i++ ) {
9635 if ( pv[i] == RExC_npar ) {
9641 pv = (I32*)SvGROW(sv_dat,
9642 SvCUR(sv_dat) + sizeof(I32)+1);
9643 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9644 pv[count] = RExC_npar;
9645 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9648 (void)SvUPGRADE(sv_dat,SVt_PVNV);
9649 sv_setpvn(sv_dat, (char *)&(RExC_npar),
9652 SvIV_set(sv_dat, 1);
9655 /* Yes this does cause a memory leak in debugging Perls
9657 if (!av_store(RExC_paren_name_list,
9658 RExC_npar, SvREFCNT_inc(svname)))
9659 SvREFCNT_dec_NN(svname);
9662 /*sv_dump(sv_dat);*/
9664 nextchar(pRExC_state);
9666 goto capturing_parens;
9668 RExC_seen |= REG_LOOKBEHIND_SEEN;
9669 RExC_in_lookbehind++;
9671 case '=': /* (?=...) */
9672 RExC_seen_zerolen++;
9674 case '!': /* (?!...) */
9675 RExC_seen_zerolen++;
9676 if (*RExC_parse == ')') {
9677 ret=reg_node(pRExC_state, OPFAIL);
9678 nextchar(pRExC_state);
9682 case '|': /* (?|...) */
9683 /* branch reset, behave like a (?:...) except that
9684 buffers in alternations share the same numbers */
9686 after_freeze = freeze_paren = RExC_npar;
9688 case ':': /* (?:...) */
9689 case '>': /* (?>...) */
9691 case '$': /* (?$...) */
9692 case '@': /* (?@...) */
9693 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9695 case '#': /* (?#...) */
9696 /* XXX As soon as we disallow separating the '?' and '*' (by
9697 * spaces or (?#...) comment), it is believed that this case
9698 * will be unreachable and can be removed. See
9700 while (*RExC_parse && *RExC_parse != ')')
9702 if (*RExC_parse != ')')
9703 FAIL("Sequence (?#... not terminated");
9704 nextchar(pRExC_state);
9707 case '0' : /* (?0) */
9708 case 'R' : /* (?R) */
9709 if (*RExC_parse != ')')
9710 FAIL("Sequence (?R) not terminated");
9711 ret = reg_node(pRExC_state, GOSTART);
9712 RExC_seen |= REG_GOSTART_SEEN;
9713 *flagp |= POSTPONED;
9714 nextchar(pRExC_state);
9717 { /* named and numeric backreferences */
9719 case '&': /* (?&NAME) */
9720 parse_start = RExC_parse - 1;
9723 SV *sv_dat = reg_scan_name(pRExC_state,
9724 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9725 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9727 if (RExC_parse == RExC_end || *RExC_parse != ')')
9728 vFAIL("Sequence (?&... not terminated");
9729 goto gen_recurse_regop;
9730 assert(0); /* NOT REACHED */
9732 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9734 vFAIL("Illegal pattern");
9736 goto parse_recursion;
9738 case '-': /* (?-1) */
9739 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9740 RExC_parse--; /* rewind to let it be handled later */
9744 case '1': case '2': case '3': case '4': /* (?1) */
9745 case '5': case '6': case '7': case '8': case '9':
9748 num = atoi(RExC_parse);
9749 parse_start = RExC_parse - 1; /* MJD */
9750 if (*RExC_parse == '-')
9752 while (isDIGIT(*RExC_parse))
9754 if (*RExC_parse!=')')
9755 vFAIL("Expecting close bracket");
9758 if ( paren == '-' ) {
9760 Diagram of capture buffer numbering.
9761 Top line is the normal capture buffer numbers
9762 Bottom line is the negative indexing as from
9766 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9770 num = RExC_npar + num;
9773 vFAIL("Reference to nonexistent group");
9775 } else if ( paren == '+' ) {
9776 num = RExC_npar + num - 1;
9779 ret = reganode(pRExC_state, GOSUB, num);
9781 if (num > (I32)RExC_rx->nparens) {
9783 vFAIL("Reference to nonexistent group");
9785 ARG2L_SET( ret, RExC_recurse_count++);
9787 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9788 "Recurse #%"UVuf" to %"IVdf"\n",
9789 (UV)ARG(ret), (IV)ARG2L(ret)));
9793 RExC_seen |= REG_RECURSE_SEEN;
9794 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9795 Set_Node_Offset(ret, parse_start); /* MJD */
9797 *flagp |= POSTPONED;
9798 nextchar(pRExC_state);
9800 } /* named and numeric backreferences */
9801 assert(0); /* NOT REACHED */
9803 case '?': /* (??...) */
9805 if (*RExC_parse != '{') {
9807 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9809 "Sequence (%"UTF8f"...) not recognized",
9810 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9813 *flagp |= POSTPONED;
9814 paren = *RExC_parse++;
9816 case '{': /* (?{...}) */
9819 struct reg_code_block *cb;
9821 RExC_seen_zerolen++;
9823 if ( !pRExC_state->num_code_blocks
9824 || pRExC_state->code_index >= pRExC_state->num_code_blocks
9825 || pRExC_state->code_blocks[pRExC_state->code_index].start
9826 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9829 if (RExC_pm_flags & PMf_USE_RE_EVAL)
9830 FAIL("panic: Sequence (?{...}): no code block found\n");
9831 FAIL("Eval-group not allowed at runtime, use re 'eval'");
9833 /* this is a pre-compiled code block (?{...}) */
9834 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9835 RExC_parse = RExC_start + cb->end;
9838 if (cb->src_regex) {
9839 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9840 RExC_rxi->data->data[n] =
9841 (void*)SvREFCNT_inc((SV*)cb->src_regex);
9842 RExC_rxi->data->data[n+1] = (void*)o;
9845 n = add_data(pRExC_state,
9846 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9847 RExC_rxi->data->data[n] = (void*)o;
9850 pRExC_state->code_index++;
9851 nextchar(pRExC_state);
9855 ret = reg_node(pRExC_state, LOGICAL);
9856 eval = reganode(pRExC_state, EVAL, n);
9859 /* for later propagation into (??{}) return value */
9860 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9862 REGTAIL(pRExC_state, ret, eval);
9863 /* deal with the length of this later - MJD */
9866 ret = reganode(pRExC_state, EVAL, n);
9867 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9868 Set_Node_Offset(ret, parse_start);
9871 case '(': /* (?(?{...})...) and (?(?=...)...) */
9874 if (RExC_parse[0] == '?') { /* (?(?...)) */
9875 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9876 || RExC_parse[1] == '<'
9877 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9881 ret = reg_node(pRExC_state, LOGICAL);
9885 tail = reg(pRExC_state, 1, &flag, depth+1);
9886 if (flag & RESTART_UTF8) {
9887 *flagp = RESTART_UTF8;
9890 REGTAIL(pRExC_state, ret, tail);
9894 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9895 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9897 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9898 char *name_start= RExC_parse++;
9900 SV *sv_dat=reg_scan_name(pRExC_state,
9901 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9902 if (RExC_parse == name_start || *RExC_parse != ch)
9903 vFAIL2("Sequence (?(%c... not terminated",
9904 (ch == '>' ? '<' : ch));
9907 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9908 RExC_rxi->data->data[num]=(void*)sv_dat;
9909 SvREFCNT_inc_simple_void(sv_dat);
9911 ret = reganode(pRExC_state,NGROUPP,num);
9912 goto insert_if_check_paren;
9914 else if (RExC_parse[0] == 'D' &&
9915 RExC_parse[1] == 'E' &&
9916 RExC_parse[2] == 'F' &&
9917 RExC_parse[3] == 'I' &&
9918 RExC_parse[4] == 'N' &&
9919 RExC_parse[5] == 'E')
9921 ret = reganode(pRExC_state,DEFINEP,0);
9924 goto insert_if_check_paren;
9926 else if (RExC_parse[0] == 'R') {
9929 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9930 parno = atoi(RExC_parse++);
9931 while (isDIGIT(*RExC_parse))
9933 } else if (RExC_parse[0] == '&') {
9936 sv_dat = reg_scan_name(pRExC_state,
9938 ? REG_RSN_RETURN_NULL
9939 : REG_RSN_RETURN_DATA);
9940 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9942 ret = reganode(pRExC_state,INSUBP,parno);
9943 goto insert_if_check_paren;
9945 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
9949 parno = atoi(RExC_parse++);
9951 while (isDIGIT(*RExC_parse))
9953 ret = reganode(pRExC_state, GROUPP, parno);
9955 insert_if_check_paren:
9956 if (*(tmp = nextchar(pRExC_state)) != ')') {
9957 /* nextchar also skips comments, so undo its work
9958 * and skip over the the next character.
9961 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9962 vFAIL("Switch condition not recognized");
9965 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
9966 br = regbranch(pRExC_state, &flags, 1,depth+1);
9968 if (flags & RESTART_UTF8) {
9969 *flagp = RESTART_UTF8;
9972 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9975 REGTAIL(pRExC_state, br, reganode(pRExC_state,
9977 c = *nextchar(pRExC_state);
9982 vFAIL("(?(DEFINE)....) does not allow branches");
9984 /* Fake one for optimizer. */
9985 lastbr = reganode(pRExC_state, IFTHEN, 0);
9987 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
9988 if (flags & RESTART_UTF8) {
9989 *flagp = RESTART_UTF8;
9992 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
9995 REGTAIL(pRExC_state, ret, lastbr);
9998 c = *nextchar(pRExC_state);
10003 vFAIL("Switch (?(condition)... contains too many branches");
10004 ender = reg_node(pRExC_state, TAIL);
10005 REGTAIL(pRExC_state, br, ender);
10007 REGTAIL(pRExC_state, lastbr, ender);
10008 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10011 REGTAIL(pRExC_state, ret, ender);
10012 RExC_size++; /* XXX WHY do we need this?!!
10013 For large programs it seems to be required
10014 but I can't figure out why. -- dmq*/
10018 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10019 vFAIL("Unknown switch condition (?(...))");
10022 case '[': /* (?[ ... ]) */
10023 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10026 RExC_parse--; /* for vFAIL to print correctly */
10027 vFAIL("Sequence (? incomplete");
10029 default: /* e.g., (?i) */
10032 parse_lparen_question_flags(pRExC_state);
10033 if (UCHARAT(RExC_parse) != ':') {
10034 nextchar(pRExC_state);
10039 nextchar(pRExC_state);
10049 ret = reganode(pRExC_state, OPEN, parno);
10051 if (!RExC_nestroot)
10052 RExC_nestroot = parno;
10053 if (RExC_seen & REG_RECURSE_SEEN
10054 && !RExC_open_parens[parno-1])
10056 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10057 "Setting open paren #%"IVdf" to %d\n",
10058 (IV)parno, REG_NODE_NUM(ret)));
10059 RExC_open_parens[parno-1]= ret;
10062 Set_Node_Length(ret, 1); /* MJD */
10063 Set_Node_Offset(ret, RExC_parse); /* MJD */
10071 /* Pick up the branches, linking them together. */
10072 parse_start = RExC_parse; /* MJD */
10073 br = regbranch(pRExC_state, &flags, 1,depth+1);
10075 /* branch_len = (paren != 0); */
10078 if (flags & RESTART_UTF8) {
10079 *flagp = RESTART_UTF8;
10082 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10084 if (*RExC_parse == '|') {
10085 if (!SIZE_ONLY && RExC_extralen) {
10086 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10089 reginsert(pRExC_state, BRANCH, br, depth+1);
10090 Set_Node_Length(br, paren != 0);
10091 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10095 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10097 else if (paren == ':') {
10098 *flagp |= flags&SIMPLE;
10100 if (is_open) { /* Starts with OPEN. */
10101 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10103 else if (paren != '?') /* Not Conditional */
10105 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10107 while (*RExC_parse == '|') {
10108 if (!SIZE_ONLY && RExC_extralen) {
10109 ender = reganode(pRExC_state, LONGJMP,0);
10111 /* Append to the previous. */
10112 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10115 RExC_extralen += 2; /* Account for LONGJMP. */
10116 nextchar(pRExC_state);
10117 if (freeze_paren) {
10118 if (RExC_npar > after_freeze)
10119 after_freeze = RExC_npar;
10120 RExC_npar = freeze_paren;
10122 br = regbranch(pRExC_state, &flags, 0, depth+1);
10125 if (flags & RESTART_UTF8) {
10126 *flagp = RESTART_UTF8;
10129 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10131 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10133 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10136 if (have_branch || paren != ':') {
10137 /* Make a closing node, and hook it on the end. */
10140 ender = reg_node(pRExC_state, TAIL);
10143 ender = reganode(pRExC_state, CLOSE, parno);
10144 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10145 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10146 "Setting close paren #%"IVdf" to %d\n",
10147 (IV)parno, REG_NODE_NUM(ender)));
10148 RExC_close_parens[parno-1]= ender;
10149 if (RExC_nestroot == parno)
10152 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10153 Set_Node_Length(ender,1); /* MJD */
10159 *flagp &= ~HASWIDTH;
10162 ender = reg_node(pRExC_state, SUCCEED);
10165 ender = reg_node(pRExC_state, END);
10167 assert(!RExC_opend); /* there can only be one! */
10168 RExC_opend = ender;
10172 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10173 SV * const mysv_val1=sv_newmortal();
10174 SV * const mysv_val2=sv_newmortal();
10175 DEBUG_PARSE_MSG("lsbr");
10176 regprop(RExC_rx, mysv_val1, lastbr, NULL);
10177 regprop(RExC_rx, mysv_val2, ender, NULL);
10178 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10179 SvPV_nolen_const(mysv_val1),
10180 (IV)REG_NODE_NUM(lastbr),
10181 SvPV_nolen_const(mysv_val2),
10182 (IV)REG_NODE_NUM(ender),
10183 (IV)(ender - lastbr)
10186 REGTAIL(pRExC_state, lastbr, ender);
10188 if (have_branch && !SIZE_ONLY) {
10189 char is_nothing= 1;
10191 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10193 /* Hook the tails of the branches to the closing node. */
10194 for (br = ret; br; br = regnext(br)) {
10195 const U8 op = PL_regkind[OP(br)];
10196 if (op == BRANCH) {
10197 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10198 if ( OP(NEXTOPER(br)) != NOTHING
10199 || regnext(NEXTOPER(br)) != ender)
10202 else if (op == BRANCHJ) {
10203 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10204 /* for now we always disable this optimisation * /
10205 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10206 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10212 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10213 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10214 SV * const mysv_val1=sv_newmortal();
10215 SV * const mysv_val2=sv_newmortal();
10216 DEBUG_PARSE_MSG("NADA");
10217 regprop(RExC_rx, mysv_val1, ret, NULL);
10218 regprop(RExC_rx, mysv_val2, ender, NULL);
10219 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10220 SvPV_nolen_const(mysv_val1),
10221 (IV)REG_NODE_NUM(ret),
10222 SvPV_nolen_const(mysv_val2),
10223 (IV)REG_NODE_NUM(ender),
10228 if (OP(ender) == TAIL) {
10233 for ( opt= br + 1; opt < ender ; opt++ )
10234 OP(opt)= OPTIMIZED;
10235 NEXT_OFF(br)= ender - br;
10243 static const char parens[] = "=!<,>";
10245 if (paren && (p = strchr(parens, paren))) {
10246 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10247 int flag = (p - parens) > 1;
10250 node = SUSPEND, flag = 0;
10251 reginsert(pRExC_state, node,ret, depth+1);
10252 Set_Node_Cur_Length(ret, parse_start);
10253 Set_Node_Offset(ret, parse_start + 1);
10255 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10259 /* Check for proper termination. */
10261 /* restore original flags, but keep (?p) */
10262 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10263 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10264 RExC_parse = oregcomp_parse;
10265 vFAIL("Unmatched (");
10268 else if (!paren && RExC_parse < RExC_end) {
10269 if (*RExC_parse == ')') {
10271 vFAIL("Unmatched )");
10274 FAIL("Junk on end of regexp"); /* "Can't happen". */
10275 assert(0); /* NOTREACHED */
10278 if (RExC_in_lookbehind) {
10279 RExC_in_lookbehind--;
10281 if (after_freeze > RExC_npar)
10282 RExC_npar = after_freeze;
10287 - regbranch - one alternative of an | operator
10289 * Implements the concatenation operator.
10291 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10295 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10299 regnode *chain = NULL;
10301 I32 flags = 0, c = 0;
10302 GET_RE_DEBUG_FLAGS_DECL;
10304 PERL_ARGS_ASSERT_REGBRANCH;
10306 DEBUG_PARSE("brnc");
10311 if (!SIZE_ONLY && RExC_extralen)
10312 ret = reganode(pRExC_state, BRANCHJ,0);
10314 ret = reg_node(pRExC_state, BRANCH);
10315 Set_Node_Length(ret, 1);
10319 if (!first && SIZE_ONLY)
10320 RExC_extralen += 1; /* BRANCHJ */
10322 *flagp = WORST; /* Tentatively. */
10325 nextchar(pRExC_state);
10326 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10327 flags &= ~TRYAGAIN;
10328 latest = regpiece(pRExC_state, &flags,depth+1);
10329 if (latest == NULL) {
10330 if (flags & TRYAGAIN)
10332 if (flags & RESTART_UTF8) {
10333 *flagp = RESTART_UTF8;
10336 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10338 else if (ret == NULL)
10340 *flagp |= flags&(HASWIDTH|POSTPONED);
10341 if (chain == NULL) /* First piece. */
10342 *flagp |= flags&SPSTART;
10345 REGTAIL(pRExC_state, chain, latest);
10350 if (chain == NULL) { /* Loop ran zero times. */
10351 chain = reg_node(pRExC_state, NOTHING);
10356 *flagp |= flags&SIMPLE;
10363 - regpiece - something followed by possible [*+?]
10365 * Note that the branching code sequences used for ? and the general cases
10366 * of * and + are somewhat optimized: they use the same NOTHING node as
10367 * both the endmarker for their branch list and the body of the last branch.
10368 * It might seem that this node could be dispensed with entirely, but the
10369 * endmarker role is not redundant.
10371 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10373 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10377 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10384 const char * const origparse = RExC_parse;
10386 I32 max = REG_INFTY;
10387 #ifdef RE_TRACK_PATTERN_OFFSETS
10390 const char *maxpos = NULL;
10392 /* Save the original in case we change the emitted regop to a FAIL. */
10393 regnode * const orig_emit = RExC_emit;
10395 GET_RE_DEBUG_FLAGS_DECL;
10397 PERL_ARGS_ASSERT_REGPIECE;
10399 DEBUG_PARSE("piec");
10401 ret = regatom(pRExC_state, &flags,depth+1);
10403 if (flags & (TRYAGAIN|RESTART_UTF8))
10404 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10406 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10412 if (op == '{' && regcurly(RExC_parse, FALSE)) {
10414 #ifdef RE_TRACK_PATTERN_OFFSETS
10415 parse_start = RExC_parse; /* MJD */
10417 next = RExC_parse + 1;
10418 while (isDIGIT(*next) || *next == ',') {
10419 if (*next == ',') {
10427 if (*next == '}') { /* got one */
10431 min = atoi(RExC_parse);
10432 if (*maxpos == ',')
10435 maxpos = RExC_parse;
10436 max = atoi(maxpos);
10437 if (!max && *maxpos != '0')
10438 max = REG_INFTY; /* meaning "infinity" */
10439 else if (max >= REG_INFTY)
10440 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10442 nextchar(pRExC_state);
10443 if (max < min) { /* If can't match, warn and optimize to fail
10446 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10448 /* We can't back off the size because we have to reserve
10449 * enough space for all the things we are about to throw
10450 * away, but we can shrink it by the ammount we are about
10451 * to re-use here */
10452 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10455 RExC_emit = orig_emit;
10457 ret = reg_node(pRExC_state, OPFAIL);
10460 else if (min == max
10461 && RExC_parse < RExC_end
10462 && (*RExC_parse == '?' || *RExC_parse == '+'))
10465 ckWARN2reg(RExC_parse + 1,
10466 "Useless use of greediness modifier '%c'",
10469 /* Absorb the modifier, so later code doesn't see nor use
10471 nextchar(pRExC_state);
10475 if ((flags&SIMPLE)) {
10476 RExC_naughty += 2 + RExC_naughty / 2;
10477 reginsert(pRExC_state, CURLY, ret, depth+1);
10478 Set_Node_Offset(ret, parse_start+1); /* MJD */
10479 Set_Node_Cur_Length(ret, parse_start);
10482 regnode * const w = reg_node(pRExC_state, WHILEM);
10485 REGTAIL(pRExC_state, ret, w);
10486 if (!SIZE_ONLY && RExC_extralen) {
10487 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10488 reginsert(pRExC_state, NOTHING,ret, depth+1);
10489 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10491 reginsert(pRExC_state, CURLYX,ret, depth+1);
10493 Set_Node_Offset(ret, parse_start+1);
10494 Set_Node_Length(ret,
10495 op == '{' ? (RExC_parse - parse_start) : 1);
10497 if (!SIZE_ONLY && RExC_extralen)
10498 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10499 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10501 RExC_whilem_seen++, RExC_extralen += 3;
10502 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10509 *flagp |= HASWIDTH;
10511 ARG1_SET(ret, (U16)min);
10512 ARG2_SET(ret, (U16)max);
10514 if (max == REG_INFTY)
10515 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10521 if (!ISMULT1(op)) {
10526 #if 0 /* Now runtime fix should be reliable. */
10528 /* if this is reinstated, don't forget to put this back into perldiag:
10530 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10532 (F) The part of the regexp subject to either the * or + quantifier
10533 could match an empty string. The {#} shows in the regular
10534 expression about where the problem was discovered.
10538 if (!(flags&HASWIDTH) && op != '?')
10539 vFAIL("Regexp *+ operand could be empty");
10542 #ifdef RE_TRACK_PATTERN_OFFSETS
10543 parse_start = RExC_parse;
10545 nextchar(pRExC_state);
10547 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10549 if (op == '*' && (flags&SIMPLE)) {
10550 reginsert(pRExC_state, STAR, ret, depth+1);
10553 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10555 else if (op == '*') {
10559 else if (op == '+' && (flags&SIMPLE)) {
10560 reginsert(pRExC_state, PLUS, ret, depth+1);
10563 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10565 else if (op == '+') {
10569 else if (op == '?') {
10574 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10575 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10576 ckWARN2reg(RExC_parse,
10577 "%"UTF8f" matches null string many times",
10578 UTF8fARG(UTF, (RExC_parse >= origparse
10579 ? RExC_parse - origparse
10582 (void)ReREFCNT_inc(RExC_rx_sv);
10585 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10586 nextchar(pRExC_state);
10587 reginsert(pRExC_state, MINMOD, ret, depth+1);
10588 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10591 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10593 nextchar(pRExC_state);
10594 ender = reg_node(pRExC_state, SUCCEED);
10595 REGTAIL(pRExC_state, ret, ender);
10596 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10598 ender = reg_node(pRExC_state, TAIL);
10599 REGTAIL(pRExC_state, ret, ender);
10602 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10604 vFAIL("Nested quantifiers");
10611 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10612 UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10613 const bool strict /* Apply stricter parsing rules? */
10617 /* This is expected to be called by a parser routine that has recognized '\N'
10618 and needs to handle the rest. RExC_parse is expected to point at the first
10619 char following the N at the time of the call. On successful return,
10620 RExC_parse has been updated to point to just after the sequence identified
10621 by this routine, and <*flagp> has been updated.
10623 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10626 \N may begin either a named sequence, or if outside a character class, mean
10627 to match a non-newline. For non single-quoted regexes, the tokenizer has
10628 attempted to decide which, and in the case of a named sequence, converted it
10629 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10630 where c1... are the characters in the sequence. For single-quoted regexes,
10631 the tokenizer passes the \N sequence through unchanged; this code will not
10632 attempt to determine this nor expand those, instead raising a syntax error.
10633 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10634 or there is no '}', it signals that this \N occurrence means to match a
10637 Only the \N{U+...} form should occur in a character class, for the same
10638 reason that '.' inside a character class means to just match a period: it
10639 just doesn't make sense.
10641 The function raises an error (via vFAIL), and doesn't return for various
10642 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
10643 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10644 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10645 only possible if node_p is non-NULL.
10648 If <valuep> is non-null, it means the caller can accept an input sequence
10649 consisting of a just a single code point; <*valuep> is set to that value
10650 if the input is such.
10652 If <node_p> is non-null it signifies that the caller can accept any other
10653 legal sequence (i.e., one that isn't just a single code point). <*node_p>
10655 1) \N means not-a-NL: points to a newly created REG_ANY node;
10656 2) \N{}: points to a new NOTHING node;
10657 3) otherwise: points to a new EXACT node containing the resolved
10659 Note that FALSE is returned for single code point sequences if <valuep> is
10663 char * endbrace; /* '}' following the name */
10665 char *endchar; /* Points to '.' or '}' ending cur char in the input
10667 bool has_multiple_chars; /* true if the input stream contains a sequence of
10668 more than one character */
10670 GET_RE_DEBUG_FLAGS_DECL;
10672 PERL_ARGS_ASSERT_GROK_BSLASH_N;
10674 GET_RE_DEBUG_FLAGS;
10676 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
10678 /* The [^\n] meaning of \N ignores spaces and comments under the /x
10679 * modifier. The other meaning does not, so use a temporary until we find
10680 * out which we are being called with */
10681 p = (RExC_flags & RXf_PMf_EXTENDED)
10682 ? regwhite( pRExC_state, RExC_parse )
10685 /* Disambiguate between \N meaning a named character versus \N meaning
10686 * [^\n]. The former is assumed when it can't be the latter. */
10687 if (*p != '{' || regcurly(p, FALSE)) {
10690 /* no bare \N allowed in a charclass */
10691 if (in_char_class) {
10692 vFAIL("\\N in a character class must be a named character: \\N{...}");
10696 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
10698 nextchar(pRExC_state);
10699 *node_p = reg_node(pRExC_state, REG_ANY);
10700 *flagp |= HASWIDTH|SIMPLE;
10702 Set_Node_Length(*node_p, 1); /* MJD */
10706 /* Here, we have decided it should be a named character or sequence */
10708 /* The test above made sure that the next real character is a '{', but
10709 * under the /x modifier, it could be separated by space (or a comment and
10710 * \n) and this is not allowed (for consistency with \x{...} and the
10711 * tokenizer handling of \N{NAME}). */
10712 if (*RExC_parse != '{') {
10713 vFAIL("Missing braces on \\N{}");
10716 RExC_parse++; /* Skip past the '{' */
10718 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10719 || ! (endbrace == RExC_parse /* nothing between the {} */
10720 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
10722 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10725 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10726 vFAIL("\\N{NAME} must be resolved by the lexer");
10729 if (endbrace == RExC_parse) { /* empty: \N{} */
10732 *node_p = reg_node(pRExC_state,NOTHING);
10734 else if (in_char_class) {
10735 if (SIZE_ONLY && in_char_class) {
10737 RExC_parse++; /* Position after the "}" */
10738 vFAIL("Zero length \\N{}");
10741 ckWARNreg(RExC_parse,
10742 "Ignoring zero length \\N{} in character class");
10750 nextchar(pRExC_state);
10754 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10755 RExC_parse += 2; /* Skip past the 'U+' */
10757 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10759 /* Code points are separated by dots. If none, there is only one code
10760 * point, and is terminated by the brace */
10761 has_multiple_chars = (endchar < endbrace);
10763 if (valuep && (! has_multiple_chars || in_char_class)) {
10764 /* We only pay attention to the first char of
10765 multichar strings being returned in char classes. I kinda wonder
10766 if this makes sense as it does change the behaviour
10767 from earlier versions, OTOH that behaviour was broken
10768 as well. XXX Solution is to recharacterize as
10769 [rest-of-class]|multi1|multi2... */
10771 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10772 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10773 | PERL_SCAN_DISALLOW_PREFIX
10774 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10776 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10778 /* The tokenizer should have guaranteed validity, but it's possible to
10779 * bypass it by using single quoting, so check */
10780 if (length_of_hex == 0
10781 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10783 RExC_parse += length_of_hex; /* Includes all the valid */
10784 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10785 ? UTF8SKIP(RExC_parse)
10787 /* Guard against malformed utf8 */
10788 if (RExC_parse >= endchar) {
10789 RExC_parse = endchar;
10791 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10794 if (in_char_class && has_multiple_chars) {
10796 RExC_parse = endbrace;
10797 vFAIL("\\N{} in character class restricted to one character");
10800 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10804 RExC_parse = endbrace + 1;
10806 else if (! node_p || ! has_multiple_chars) {
10808 /* Here, the input is legal, but not according to the caller's
10809 * options. We fail without advancing the parse, so that the
10810 * caller can try again */
10816 /* What is done here is to convert this to a sub-pattern of the form
10817 * (?:\x{char1}\x{char2}...)
10818 * and then call reg recursively. That way, it retains its atomicness,
10819 * while not having to worry about special handling that some code
10820 * points may have. toke.c has converted the original Unicode values
10821 * to native, so that we can just pass on the hex values unchanged. We
10822 * do have to set a flag to keep recoding from happening in the
10825 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10827 char *orig_end = RExC_end;
10830 while (RExC_parse < endbrace) {
10832 /* Convert to notation the rest of the code understands */
10833 sv_catpv(substitute_parse, "\\x{");
10834 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10835 sv_catpv(substitute_parse, "}");
10837 /* Point to the beginning of the next character in the sequence. */
10838 RExC_parse = endchar + 1;
10839 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10841 sv_catpv(substitute_parse, ")");
10843 RExC_parse = SvPV(substitute_parse, len);
10845 /* Don't allow empty number */
10847 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10849 RExC_end = RExC_parse + len;
10851 /* The values are Unicode, and therefore not subject to recoding */
10852 RExC_override_recoding = 1;
10854 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10855 if (flags & RESTART_UTF8) {
10856 *flagp = RESTART_UTF8;
10859 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10862 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10864 RExC_parse = endbrace;
10865 RExC_end = orig_end;
10866 RExC_override_recoding = 0;
10868 nextchar(pRExC_state);
10878 * It returns the code point in utf8 for the value in *encp.
10879 * value: a code value in the source encoding
10880 * encp: a pointer to an Encode object
10882 * If the result from Encode is not a single character,
10883 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10886 S_reg_recode(pTHX_ const char value, SV **encp)
10889 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10890 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10891 const STRLEN newlen = SvCUR(sv);
10892 UV uv = UNICODE_REPLACEMENT;
10894 PERL_ARGS_ASSERT_REG_RECODE;
10898 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10901 if (!newlen || numlen != newlen) {
10902 uv = UNICODE_REPLACEMENT;
10908 PERL_STATIC_INLINE U8
10909 S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
10913 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10919 op = get_regex_charset(RExC_flags);
10920 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10921 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10922 been, so there is no hole */
10925 return op + EXACTF;
10928 PERL_STATIC_INLINE void
10929 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
10930 regnode *node, I32* flagp, STRLEN len, UV code_point,
10933 /* This knows the details about sizing an EXACTish node, setting flags for
10934 * it (by setting <*flagp>, and potentially populating it with a single
10937 * If <len> (the length in bytes) is non-zero, this function assumes that
10938 * the node has already been populated, and just does the sizing. In this
10939 * case <code_point> should be the final code point that has already been
10940 * placed into the node. This value will be ignored except that under some
10941 * circumstances <*flagp> is set based on it.
10943 * If <len> is zero, the function assumes that the node is to contain only
10944 * the single character given by <code_point> and calculates what <len>
10945 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
10946 * additionally will populate the node's STRING with <code_point> or its
10949 * In both cases <*flagp> is appropriately set
10951 * It knows that under FOLD, the Latin Sharp S and UTF characters above
10952 * 255, must be folded (the former only when the rules indicate it can
10955 * When it does the populating, it looks at the flag 'downgradable'. If
10956 * true with a node that folds, it checks if the single code point
10957 * participates in a fold, and if not downgrades the node to an EXACT.
10958 * This helps the optimizer */
10960 bool len_passed_in = cBOOL(len != 0);
10961 U8 character[UTF8_MAXBYTES_CASE+1];
10963 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
10965 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
10966 * sizing difference, and is extra work that is thrown away */
10967 if (downgradable && ! PASS2) {
10968 downgradable = FALSE;
10971 if (! len_passed_in) {
10973 if (UNI_IS_INVARIANT(code_point)) {
10974 if (LOC || ! FOLD) { /* /l defers folding until runtime */
10975 *character = (U8) code_point;
10977 else { /* Here is /i and not /l (toFOLD() is defined on just
10978 ASCII, which isn't the same thing as INVARIANT on
10979 EBCDIC, but it works there, as the extra invariants
10980 fold to themselves) */
10981 *character = toFOLD((U8) code_point);
10983 /* We can downgrade to an EXACT node if this character
10984 * isn't a folding one. Note that this assumes that
10985 * nothing above Latin1 folds to some other invariant than
10986 * one of these alphabetics; otherwise we would also have
10988 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
10989 * || ASCII_FOLD_RESTRICTED))
10991 if (downgradable && PL_fold[code_point] == code_point) {
10997 else if (FOLD && (! LOC
10998 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
10999 { /* Folding, and ok to do so now */
11000 UV folded = _to_uni_fold_flags(
11004 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11005 ? FOLD_FLAGS_NOMIX_ASCII
11008 && folded == code_point
11009 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11014 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11016 /* Not folding this cp, and can output it directly */
11017 *character = UTF8_TWO_BYTE_HI(code_point);
11018 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11022 uvchr_to_utf8( character, code_point);
11023 len = UTF8SKIP(character);
11025 } /* Else pattern isn't UTF8. */
11027 *character = (U8) code_point;
11029 } /* Else is folded non-UTF8 */
11030 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11032 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11033 * comments at join_exact()); */
11034 *character = (U8) code_point;
11037 /* Can turn into an EXACT node if we know the fold at compile time,
11038 * and it folds to itself and doesn't particpate in other folds */
11041 && PL_fold_latin1[code_point] == code_point
11042 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11043 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11047 } /* else is Sharp s. May need to fold it */
11048 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11050 *(character + 1) = 's';
11054 *character = LATIN_SMALL_LETTER_SHARP_S;
11060 RExC_size += STR_SZ(len);
11063 RExC_emit += STR_SZ(len);
11064 STR_LEN(node) = len;
11065 if (! len_passed_in) {
11066 Copy((char *) character, STRING(node), len, char);
11070 *flagp |= HASWIDTH;
11072 /* A single character node is SIMPLE, except for the special-cased SHARP S
11074 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11075 && (code_point != LATIN_SMALL_LETTER_SHARP_S
11076 || ! FOLD || ! DEPENDS_SEMANTICS))
11081 /* The OP may not be well defined in PASS1 */
11082 if (PASS2 && OP(node) == EXACTFL) {
11083 RExC_contains_locale = 1;
11088 /* return atoi(p), unless it's too big to sensibly be a backref,
11089 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11092 S_backref_value(char *p)
11096 for (;isDIGIT(*q); q++); /* calculate length of num */
11097 if (q - p == 0 || q - p > 9)
11104 - regatom - the lowest level
11106 Try to identify anything special at the start of the pattern. If there
11107 is, then handle it as required. This may involve generating a single regop,
11108 such as for an assertion; or it may involve recursing, such as to
11109 handle a () structure.
11111 If the string doesn't start with something special then we gobble up
11112 as much literal text as we can.
11114 Once we have been able to handle whatever type of thing started the
11115 sequence, we return.
11117 Note: we have to be careful with escapes, as they can be both literal
11118 and special, and in the case of \10 and friends, context determines which.
11120 A summary of the code structure is:
11122 switch (first_byte) {
11123 cases for each special:
11124 handle this special;
11127 switch (2nd byte) {
11128 cases for each unambiguous special:
11129 handle this special;
11131 cases for each ambigous special/literal:
11133 if (special) handle here
11135 default: // unambiguously literal:
11138 default: // is a literal char
11141 create EXACTish node for literal;
11142 while (more input and node isn't full) {
11143 switch (input_byte) {
11144 cases for each special;
11145 make sure parse pointer is set so that the next call to
11146 regatom will see this special first
11147 goto loopdone; // EXACTish node terminated by prev. char
11149 append char to EXACTISH node;
11151 get next input byte;
11155 return the generated node;
11157 Specifically there are two separate switches for handling
11158 escape sequences, with the one for handling literal escapes requiring
11159 a dummy entry for all of the special escapes that are actually handled
11162 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11164 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11166 Otherwise does not return NULL.
11170 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11173 regnode *ret = NULL;
11175 char *parse_start = RExC_parse;
11179 GET_RE_DEBUG_FLAGS_DECL;
11181 *flagp = WORST; /* Tentatively. */
11183 DEBUG_PARSE("atom");
11185 PERL_ARGS_ASSERT_REGATOM;
11188 switch ((U8)*RExC_parse) {
11190 RExC_seen_zerolen++;
11191 nextchar(pRExC_state);
11192 if (RExC_flags & RXf_PMf_MULTILINE)
11193 ret = reg_node(pRExC_state, MBOL);
11194 else if (RExC_flags & RXf_PMf_SINGLELINE)
11195 ret = reg_node(pRExC_state, SBOL);
11197 ret = reg_node(pRExC_state, BOL);
11198 Set_Node_Length(ret, 1); /* MJD */
11201 nextchar(pRExC_state);
11203 RExC_seen_zerolen++;
11204 if (RExC_flags & RXf_PMf_MULTILINE)
11205 ret = reg_node(pRExC_state, MEOL);
11206 else if (RExC_flags & RXf_PMf_SINGLELINE)
11207 ret = reg_node(pRExC_state, SEOL);
11209 ret = reg_node(pRExC_state, EOL);
11210 Set_Node_Length(ret, 1); /* MJD */
11213 nextchar(pRExC_state);
11214 if (RExC_flags & RXf_PMf_SINGLELINE)
11215 ret = reg_node(pRExC_state, SANY);
11217 ret = reg_node(pRExC_state, REG_ANY);
11218 *flagp |= HASWIDTH|SIMPLE;
11220 Set_Node_Length(ret, 1); /* MJD */
11224 char * const oregcomp_parse = ++RExC_parse;
11225 ret = regclass(pRExC_state, flagp,depth+1,
11226 FALSE, /* means parse the whole char class */
11227 TRUE, /* allow multi-char folds */
11228 FALSE, /* don't silence non-portable warnings. */
11230 if (*RExC_parse != ']') {
11231 RExC_parse = oregcomp_parse;
11232 vFAIL("Unmatched [");
11235 if (*flagp & RESTART_UTF8)
11237 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11240 nextchar(pRExC_state);
11241 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11245 nextchar(pRExC_state);
11246 ret = reg(pRExC_state, 2, &flags,depth+1);
11248 if (flags & TRYAGAIN) {
11249 if (RExC_parse == RExC_end) {
11250 /* Make parent create an empty node if needed. */
11251 *flagp |= TRYAGAIN;
11256 if (flags & RESTART_UTF8) {
11257 *flagp = RESTART_UTF8;
11260 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11263 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11267 if (flags & TRYAGAIN) {
11268 *flagp |= TRYAGAIN;
11271 vFAIL("Internal urp");
11272 /* Supposed to be caught earlier. */
11275 if (!regcurly(RExC_parse, FALSE)) {
11284 vFAIL("Quantifier follows nothing");
11289 This switch handles escape sequences that resolve to some kind
11290 of special regop and not to literal text. Escape sequnces that
11291 resolve to literal text are handled below in the switch marked
11294 Every entry in this switch *must* have a corresponding entry
11295 in the literal escape switch. However, the opposite is not
11296 required, as the default for this switch is to jump to the
11297 literal text handling code.
11299 switch ((U8)*++RExC_parse) {
11301 /* Special Escapes */
11303 RExC_seen_zerolen++;
11304 ret = reg_node(pRExC_state, SBOL);
11306 goto finish_meta_pat;
11308 ret = reg_node(pRExC_state, GPOS);
11309 RExC_seen |= REG_GPOS_SEEN;
11311 goto finish_meta_pat;
11313 RExC_seen_zerolen++;
11314 ret = reg_node(pRExC_state, KEEPS);
11316 /* XXX:dmq : disabling in-place substitution seems to
11317 * be necessary here to avoid cases of memory corruption, as
11318 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11320 RExC_seen |= REG_LOOKBEHIND_SEEN;
11321 goto finish_meta_pat;
11323 ret = reg_node(pRExC_state, SEOL);
11325 RExC_seen_zerolen++; /* Do not optimize RE away */
11326 goto finish_meta_pat;
11328 ret = reg_node(pRExC_state, EOS);
11330 RExC_seen_zerolen++; /* Do not optimize RE away */
11331 goto finish_meta_pat;
11333 ret = reg_node(pRExC_state, CANY);
11334 RExC_seen |= REG_CANY_SEEN;
11335 *flagp |= HASWIDTH|SIMPLE;
11336 goto finish_meta_pat;
11338 ret = reg_node(pRExC_state, CLUMP);
11339 *flagp |= HASWIDTH;
11340 goto finish_meta_pat;
11346 arg = ANYOF_WORDCHAR;
11350 RExC_seen_zerolen++;
11351 RExC_seen |= REG_LOOKBEHIND_SEEN;
11352 op = BOUND + get_regex_charset(RExC_flags);
11353 if (op > BOUNDA) { /* /aa is same as /a */
11356 else if (op == BOUNDL) {
11357 RExC_contains_locale = 1;
11359 ret = reg_node(pRExC_state, op);
11360 FLAGS(ret) = get_regex_charset(RExC_flags);
11362 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11363 /* diag_listed_as: Use "%s" instead of "%s" */
11364 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11366 goto finish_meta_pat;
11368 RExC_seen_zerolen++;
11369 RExC_seen |= REG_LOOKBEHIND_SEEN;
11370 op = NBOUND + get_regex_charset(RExC_flags);
11371 if (op > NBOUNDA) { /* /aa is same as /a */
11374 else if (op == NBOUNDL) {
11375 RExC_contains_locale = 1;
11377 ret = reg_node(pRExC_state, op);
11378 FLAGS(ret) = get_regex_charset(RExC_flags);
11380 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11381 /* diag_listed_as: Use "%s" instead of "%s" */
11382 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11384 goto finish_meta_pat;
11394 ret = reg_node(pRExC_state, LNBREAK);
11395 *flagp |= HASWIDTH|SIMPLE;
11396 goto finish_meta_pat;
11404 goto join_posix_op_known;
11410 arg = ANYOF_VERTWS;
11412 goto join_posix_op_known;
11422 op = POSIXD + get_regex_charset(RExC_flags);
11423 if (op > POSIXA) { /* /aa is same as /a */
11426 else if (op == POSIXL) {
11427 RExC_contains_locale = 1;
11430 join_posix_op_known:
11433 op += NPOSIXD - POSIXD;
11436 ret = reg_node(pRExC_state, op);
11438 FLAGS(ret) = namedclass_to_classnum(arg);
11441 *flagp |= HASWIDTH|SIMPLE;
11445 nextchar(pRExC_state);
11446 Set_Node_Length(ret, 2); /* MJD */
11452 char* parse_start = RExC_parse - 2;
11457 ret = regclass(pRExC_state, flagp,depth+1,
11458 TRUE, /* means just parse this element */
11459 FALSE, /* don't allow multi-char folds */
11460 FALSE, /* don't silence non-portable warnings.
11461 It would be a bug if these returned
11464 /* regclass() can only return RESTART_UTF8 if multi-char folds
11467 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11472 Set_Node_Offset(ret, parse_start + 2);
11473 Set_Node_Cur_Length(ret, parse_start);
11474 nextchar(pRExC_state);
11478 /* Handle \N and \N{NAME} with multiple code points here and not
11479 * below because it can be multicharacter. join_exact() will join
11480 * them up later on. Also this makes sure that things like
11481 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11482 * The options to the grok function call causes it to fail if the
11483 * sequence is just a single code point. We then go treat it as
11484 * just another character in the current EXACT node, and hence it
11485 * gets uniform treatment with all the other characters. The
11486 * special treatment for quantifiers is not needed for such single
11487 * character sequences */
11489 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11490 FALSE /* not strict */ )) {
11491 if (*flagp & RESTART_UTF8)
11497 case 'k': /* Handle \k<NAME> and \k'NAME' */
11500 char ch= RExC_parse[1];
11501 if (ch != '<' && ch != '\'' && ch != '{') {
11503 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11504 vFAIL2("Sequence %.2s... not terminated",parse_start);
11506 /* this pretty much dupes the code for (?P=...) in reg(), if
11507 you change this make sure you change that */
11508 char* name_start = (RExC_parse += 2);
11510 SV *sv_dat = reg_scan_name(pRExC_state,
11511 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11512 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11513 if (RExC_parse == name_start || *RExC_parse != ch)
11514 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11515 vFAIL2("Sequence %.3s... not terminated",parse_start);
11518 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11519 RExC_rxi->data->data[num]=(void*)sv_dat;
11520 SvREFCNT_inc_simple_void(sv_dat);
11524 ret = reganode(pRExC_state,
11527 : (ASCII_FOLD_RESTRICTED)
11529 : (AT_LEAST_UNI_SEMANTICS)
11535 *flagp |= HASWIDTH;
11537 /* override incorrect value set in reganode MJD */
11538 Set_Node_Offset(ret, parse_start+1);
11539 Set_Node_Cur_Length(ret, parse_start);
11540 nextchar(pRExC_state);
11546 case '1': case '2': case '3': case '4':
11547 case '5': case '6': case '7': case '8': case '9':
11552 if (*RExC_parse == 'g') {
11556 if (*RExC_parse == '{') {
11560 if (*RExC_parse == '-') {
11564 if (hasbrace && !isDIGIT(*RExC_parse)) {
11565 if (isrel) RExC_parse--;
11567 goto parse_named_seq;
11570 num = S_backref_value(RExC_parse);
11572 vFAIL("Reference to invalid group 0");
11573 else if (num == I32_MAX) {
11574 if (isDIGIT(*RExC_parse))
11575 vFAIL("Reference to nonexistent group");
11577 vFAIL("Unterminated \\g... pattern");
11581 num = RExC_npar - num;
11583 vFAIL("Reference to nonexistent or unclosed group");
11587 num = S_backref_value(RExC_parse);
11588 /* bare \NNN might be backref or octal - if it is larger than or equal
11589 * RExC_npar then it is assumed to be and octal escape.
11590 * Note RExC_npar is +1 from the actual number of parens*/
11591 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11592 && *RExC_parse != '8' && *RExC_parse != '9'))
11594 /* Probably a character specified in octal, e.g. \35 */
11599 /* at this point RExC_parse definitely points to a backref
11602 #ifdef RE_TRACK_PATTERN_OFFSETS
11603 char * const parse_start = RExC_parse - 1; /* MJD */
11605 while (isDIGIT(*RExC_parse))
11608 if (*RExC_parse != '}')
11609 vFAIL("Unterminated \\g{...} pattern");
11613 if (num > (I32)RExC_rx->nparens)
11614 vFAIL("Reference to nonexistent group");
11617 ret = reganode(pRExC_state,
11620 : (ASCII_FOLD_RESTRICTED)
11622 : (AT_LEAST_UNI_SEMANTICS)
11628 *flagp |= HASWIDTH;
11630 /* override incorrect value set in reganode MJD */
11631 Set_Node_Offset(ret, parse_start+1);
11632 Set_Node_Cur_Length(ret, parse_start);
11634 nextchar(pRExC_state);
11639 if (RExC_parse >= RExC_end)
11640 FAIL("Trailing \\");
11643 /* Do not generate "unrecognized" warnings here, we fall
11644 back into the quick-grab loop below */
11651 if (RExC_flags & RXf_PMf_EXTENDED) {
11652 if ( reg_skipcomment( pRExC_state ) )
11659 parse_start = RExC_parse - 1;
11668 #define MAX_NODE_STRING_SIZE 127
11669 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11671 U8 upper_parse = MAX_NODE_STRING_SIZE;
11672 U8 node_type = compute_EXACTish(pRExC_state);
11673 bool next_is_quantifier;
11674 char * oldp = NULL;
11676 /* We can convert EXACTF nodes to EXACTFU if they contain only
11677 * characters that match identically regardless of the target
11678 * string's UTF8ness. The reason to do this is that EXACTF is not
11679 * trie-able, EXACTFU is.
11681 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11682 * contain only above-Latin1 characters (hence must be in UTF8),
11683 * which don't participate in folds with Latin1-range characters,
11684 * as the latter's folds aren't known until runtime. (We don't
11685 * need to figure this out until pass 2) */
11686 bool maybe_exactfu = PASS2
11687 && (node_type == EXACTF || node_type == EXACTFL);
11689 /* If a folding node contains only code points that don't
11690 * participate in folds, it can be changed into an EXACT node,
11691 * which allows the optimizer more things to look for */
11694 ret = reg_node(pRExC_state, node_type);
11696 /* In pass1, folded, we use a temporary buffer instead of the
11697 * actual node, as the node doesn't exist yet */
11698 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11704 /* We do the EXACTFish to EXACT node only if folding. (And we
11705 * don't need to figure this out until pass 2) */
11706 maybe_exact = FOLD && PASS2;
11708 /* XXX The node can hold up to 255 bytes, yet this only goes to
11709 * 127. I (khw) do not know why. Keeping it somewhat less than
11710 * 255 allows us to not have to worry about overflow due to
11711 * converting to utf8 and fold expansion, but that value is
11712 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
11713 * split up by this limit into a single one using the real max of
11714 * 255. Even at 127, this breaks under rare circumstances. If
11715 * folding, we do not want to split a node at a character that is a
11716 * non-final in a multi-char fold, as an input string could just
11717 * happen to want to match across the node boundary. The join
11718 * would solve that problem if the join actually happens. But a
11719 * series of more than two nodes in a row each of 127 would cause
11720 * the first join to succeed to get to 254, but then there wouldn't
11721 * be room for the next one, which could at be one of those split
11722 * multi-char folds. I don't know of any fool-proof solution. One
11723 * could back off to end with only a code point that isn't such a
11724 * non-final, but it is possible for there not to be any in the
11726 for (p = RExC_parse - 1;
11727 len < upper_parse && p < RExC_end;
11732 if (RExC_flags & RXf_PMf_EXTENDED)
11733 p = regwhite( pRExC_state, p );
11744 /* Literal Escapes Switch
11746 This switch is meant to handle escape sequences that
11747 resolve to a literal character.
11749 Every escape sequence that represents something
11750 else, like an assertion or a char class, is handled
11751 in the switch marked 'Special Escapes' above in this
11752 routine, but also has an entry here as anything that
11753 isn't explicitly mentioned here will be treated as
11754 an unescaped equivalent literal.
11757 switch ((U8)*++p) {
11758 /* These are all the special escapes. */
11759 case 'A': /* Start assertion */
11760 case 'b': case 'B': /* Word-boundary assertion*/
11761 case 'C': /* Single char !DANGEROUS! */
11762 case 'd': case 'D': /* digit class */
11763 case 'g': case 'G': /* generic-backref, pos assertion */
11764 case 'h': case 'H': /* HORIZWS */
11765 case 'k': case 'K': /* named backref, keep marker */
11766 case 'p': case 'P': /* Unicode property */
11767 case 'R': /* LNBREAK */
11768 case 's': case 'S': /* space class */
11769 case 'v': case 'V': /* VERTWS */
11770 case 'w': case 'W': /* word class */
11771 case 'X': /* eXtended Unicode "combining
11772 character sequence" */
11773 case 'z': case 'Z': /* End of line/string assertion */
11777 /* Anything after here is an escape that resolves to a
11778 literal. (Except digits, which may or may not)
11784 case 'N': /* Handle a single-code point named character. */
11785 /* The options cause it to fail if a multiple code
11786 * point sequence. Handle those in the switch() above
11788 RExC_parse = p + 1;
11789 if (! grok_bslash_N(pRExC_state, NULL, &ender,
11790 flagp, depth, FALSE,
11791 FALSE /* not strict */ ))
11793 if (*flagp & RESTART_UTF8)
11794 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11795 RExC_parse = p = oldp;
11799 if (ender > 0xff) {
11816 ender = ASCII_TO_NATIVE('\033');
11826 const char* error_msg;
11828 bool valid = grok_bslash_o(&p,
11831 TRUE, /* out warnings */
11832 FALSE, /* not strict */
11833 TRUE, /* Output warnings
11838 RExC_parse = p; /* going to die anyway; point
11839 to exact spot of failure */
11843 if (PL_encoding && ender < 0x100) {
11844 goto recode_encoding;
11846 if (ender > 0xff) {
11853 UV result = UV_MAX; /* initialize to erroneous
11855 const char* error_msg;
11857 bool valid = grok_bslash_x(&p,
11860 TRUE, /* out warnings */
11861 FALSE, /* not strict */
11862 TRUE, /* Output warnings
11867 RExC_parse = p; /* going to die anyway; point
11868 to exact spot of failure */
11873 if (PL_encoding && ender < 0x100) {
11874 goto recode_encoding;
11876 if (ender > 0xff) {
11883 ender = grok_bslash_c(*p++, SIZE_ONLY);
11885 case '8': case '9': /* must be a backreference */
11888 case '1': case '2': case '3':case '4':
11889 case '5': case '6': case '7':
11890 /* When we parse backslash escapes there is ambiguity
11891 * between backreferences and octal escapes. Any escape
11892 * from \1 - \9 is a backreference, any multi-digit
11893 * escape which does not start with 0 and which when
11894 * evaluated as decimal could refer to an already
11895 * parsed capture buffer is a backslash. Anything else
11898 * Note this implies that \118 could be interpreted as
11899 * 118 OR as "\11" . "8" depending on whether there
11900 * were 118 capture buffers defined already in the
11903 /* NOTE, RExC_npar is 1 more than the actual number of
11904 * parens we have seen so far, hence the < RExC_npar below. */
11906 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11907 { /* Not to be treated as an octal constant, go
11914 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11916 ender = grok_oct(p, &numlen, &flags, NULL);
11917 if (ender > 0xff) {
11921 if (SIZE_ONLY /* like \08, \178 */
11924 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11926 reg_warn_non_literal_string(
11928 form_short_octal_warning(p, numlen));
11931 if (PL_encoding && ender < 0x100)
11932 goto recode_encoding;
11935 if (! RExC_override_recoding) {
11936 SV* enc = PL_encoding;
11937 ender = reg_recode((const char)(U8)ender, &enc);
11938 if (!enc && SIZE_ONLY)
11939 ckWARNreg(p, "Invalid escape in the specified encoding");
11945 FAIL("Trailing \\");
11948 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
11949 /* Include any { following the alpha to emphasize
11950 * that it could be part of an escape at some point
11952 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
11953 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
11955 goto normal_default;
11956 } /* End of switch on '\' */
11958 default: /* A literal character */
11961 && RExC_flags & RXf_PMf_EXTENDED
11962 && ckWARN_d(WARN_DEPRECATED)
11963 && is_PATWS_non_low_safe(p, RExC_end, UTF))
11965 vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
11966 "Escape literal pattern white space under /x");
11970 if (UTF8_IS_START(*p) && UTF) {
11972 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
11973 &numlen, UTF8_ALLOW_DEFAULT);
11979 } /* End of switch on the literal */
11981 /* Here, have looked at the literal character and <ender>
11982 * contains its ordinal, <p> points to the character after it
11985 if ( RExC_flags & RXf_PMf_EXTENDED)
11986 p = regwhite( pRExC_state, p );
11988 /* If the next thing is a quantifier, it applies to this
11989 * character only, which means that this character has to be in
11990 * its own node and can't just be appended to the string in an
11991 * existing node, so if there are already other characters in
11992 * the node, close the node with just them, and set up to do
11993 * this character again next time through, when it will be the
11994 * only thing in its new node */
11995 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12001 if (! FOLD /* The simple case, just append the literal */
12002 || (LOC /* Also don't fold for tricky chars under /l */
12003 && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12007 /* Normally, we don't need the representation of the
12008 * character in the sizing pass--just its size, but if
12009 * folding, we have to actually put the character out
12010 * even in the sizing pass, because the size could
12011 * change as we juggle things at the end of this loop
12012 * to avoid splitting a too-full node in the middle of
12013 * a potential multi-char fold [perl #123539] */
12014 const STRLEN unilen = (SIZE_ONLY && ! FOLD)
12016 : (uvchr_to_utf8((U8*)s, ender) - (U8*)s);
12022 /* The loop increments <len> each time, as all but this
12023 * path (and one other) through it add a single byte to
12024 * the EXACTish node. But this one has changed len to
12025 * be the correct final value, so subtract one to
12026 * cancel out the increment that follows */
12030 /* See comment above for [perl #123539] */
12031 *(s++) = (char) ender;
12034 REGC((char)ender, s++);
12037 /* Can get here if folding only if is one of the /l
12038 * characters whose fold depends on the locale. The
12039 * occurrence of any of these indicate that we can't
12040 * simplify things */
12042 maybe_exact = FALSE;
12043 maybe_exactfu = FALSE;
12048 /* See comments for join_exact() as to why we fold this
12049 * non-UTF at compile time */
12050 || (node_type == EXACTFU
12051 && ender == LATIN_SMALL_LETTER_SHARP_S)))
12053 /* Here, are folding and are not UTF-8 encoded; therefore
12054 * the character must be in the range 0-255, and is not /l
12055 * (Not /l because we already handled these under /l in
12056 * is_PROBLEMATIC_LOCALE_FOLD_cp */
12057 if (IS_IN_SOME_FOLD_L1(ender)) {
12058 maybe_exact = FALSE;
12060 /* See if the character's fold differs between /d and
12061 * /u. This includes the multi-char fold SHARP S to
12064 && (PL_fold[ender] != PL_fold_latin1[ender]
12065 || ender == LATIN_SMALL_LETTER_SHARP_S
12067 && isARG2_lower_or_UPPER_ARG1('s', ender)
12068 && isARG2_lower_or_UPPER_ARG1('s',
12071 maybe_exactfu = FALSE;
12075 /* Even when folding, we store just the input character, as
12076 * we have an array that finds its fold quickly */
12077 *(s++) = (char) ender;
12079 else { /* FOLD and UTF */
12080 /* Unlike the non-fold case, we do actually have to
12081 * calculate the results here in pass 1. This is for two
12082 * reasons, the folded length may be longer than the
12083 * unfolded, and we have to calculate how many EXACTish
12084 * nodes it will take; and we may run out of room in a node
12085 * in the middle of a potential multi-char fold, and have
12086 * to back off accordingly. (Hence we can't use REGC for
12087 * the simple case just below.) */
12090 if (isASCII(ender)) {
12091 folded = toFOLD(ender);
12092 *(s)++ = (U8) folded;
12097 folded = _to_uni_fold_flags(
12101 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12102 ? FOLD_FLAGS_NOMIX_ASCII
12106 /* The loop increments <len> each time, as all but this
12107 * path (and one other) through it add a single byte to
12108 * the EXACTish node. But this one has changed len to
12109 * be the correct final value, so subtract one to
12110 * cancel out the increment that follows */
12111 len += foldlen - 1;
12113 /* If this node only contains non-folding code points so
12114 * far, see if this new one is also non-folding */
12116 if (folded != ender) {
12117 maybe_exact = FALSE;
12120 /* Here the fold is the original; we have to check
12121 * further to see if anything folds to it */
12122 if (_invlist_contains_cp(PL_utf8_foldable,
12125 maybe_exact = FALSE;
12132 if (next_is_quantifier) {
12134 /* Here, the next input is a quantifier, and to get here,
12135 * the current character is the only one in the node.
12136 * Also, here <len> doesn't include the final byte for this
12142 } /* End of loop through literal characters */
12144 /* Here we have either exhausted the input or ran out of room in
12145 * the node. (If we encountered a character that can't be in the
12146 * node, transfer is made directly to <loopdone>, and so we
12147 * wouldn't have fallen off the end of the loop.) In the latter
12148 * case, we artificially have to split the node into two, because
12149 * we just don't have enough space to hold everything. This
12150 * creates a problem if the final character participates in a
12151 * multi-character fold in the non-final position, as a match that
12152 * should have occurred won't, due to the way nodes are matched,
12153 * and our artificial boundary. So back off until we find a non-
12154 * problematic character -- one that isn't at the beginning or
12155 * middle of such a fold. (Either it doesn't participate in any
12156 * folds, or appears only in the final position of all the folds it
12157 * does participate in.) A better solution with far fewer false
12158 * positives, and that would fill the nodes more completely, would
12159 * be to actually have available all the multi-character folds to
12160 * test against, and to back-off only far enough to be sure that
12161 * this node isn't ending with a partial one. <upper_parse> is set
12162 * further below (if we need to reparse the node) to include just
12163 * up through that final non-problematic character that this code
12164 * identifies, so when it is set to less than the full node, we can
12165 * skip the rest of this */
12166 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12168 const STRLEN full_len = len;
12170 assert(len >= MAX_NODE_STRING_SIZE);
12172 /* Here, <s> points to the final byte of the final character.
12173 * Look backwards through the string until find a non-
12174 * problematic character */
12178 /* This has no multi-char folds to non-UTF characters */
12179 if (ASCII_FOLD_RESTRICTED) {
12183 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12187 if (! PL_NonL1NonFinalFold) {
12188 PL_NonL1NonFinalFold = _new_invlist_C_array(
12189 NonL1_Perl_Non_Final_Folds_invlist);
12192 /* Point to the first byte of the final character */
12193 s = (char *) utf8_hop((U8 *) s, -1);
12195 while (s >= s0) { /* Search backwards until find
12196 non-problematic char */
12197 if (UTF8_IS_INVARIANT(*s)) {
12199 /* There are no ascii characters that participate
12200 * in multi-char folds under /aa. In EBCDIC, the
12201 * non-ascii invariants are all control characters,
12202 * so don't ever participate in any folds. */
12203 if (ASCII_FOLD_RESTRICTED
12204 || ! IS_NON_FINAL_FOLD(*s))
12209 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12210 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12216 else if (! _invlist_contains_cp(
12217 PL_NonL1NonFinalFold,
12218 valid_utf8_to_uvchr((U8 *) s, NULL)))
12223 /* Here, the current character is problematic in that
12224 * it does occur in the non-final position of some
12225 * fold, so try the character before it, but have to
12226 * special case the very first byte in the string, so
12227 * we don't read outside the string */
12228 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12229 } /* End of loop backwards through the string */
12231 /* If there were only problematic characters in the string,
12232 * <s> will point to before s0, in which case the length
12233 * should be 0, otherwise include the length of the
12234 * non-problematic character just found */
12235 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12238 /* Here, have found the final character, if any, that is
12239 * non-problematic as far as ending the node without splitting
12240 * it across a potential multi-char fold. <len> contains the
12241 * number of bytes in the node up-to and including that
12242 * character, or is 0 if there is no such character, meaning
12243 * the whole node contains only problematic characters. In
12244 * this case, give up and just take the node as-is. We can't
12249 /* If the node ends in an 's' we make sure it stays EXACTF,
12250 * as if it turns into an EXACTFU, it could later get
12251 * joined with another 's' that would then wrongly match
12253 if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12255 maybe_exactfu = FALSE;
12259 /* Here, the node does contain some characters that aren't
12260 * problematic. If one such is the final character in the
12261 * node, we are done */
12262 if (len == full_len) {
12265 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12267 /* If the final character is problematic, but the
12268 * penultimate is not, back-off that last character to
12269 * later start a new node with it */
12274 /* Here, the final non-problematic character is earlier
12275 * in the input than the penultimate character. What we do
12276 * is reparse from the beginning, going up only as far as
12277 * this final ok one, thus guaranteeing that the node ends
12278 * in an acceptable character. The reason we reparse is
12279 * that we know how far in the character is, but we don't
12280 * know how to correlate its position with the input parse.
12281 * An alternate implementation would be to build that
12282 * correlation as we go along during the original parse,
12283 * but that would entail extra work for every node, whereas
12284 * this code gets executed only when the string is too
12285 * large for the node, and the final two characters are
12286 * problematic, an infrequent occurrence. Yet another
12287 * possible strategy would be to save the tail of the
12288 * string, and the next time regatom is called, initialize
12289 * with that. The problem with this is that unless you
12290 * back off one more character, you won't be guaranteed
12291 * regatom will get called again, unless regbranch,
12292 * regpiece ... are also changed. If you do back off that
12293 * extra character, so that there is input guaranteed to
12294 * force calling regatom, you can't handle the case where
12295 * just the first character in the node is acceptable. I
12296 * (khw) decided to try this method which doesn't have that
12297 * pitfall; if performance issues are found, we can do a
12298 * combination of the current approach plus that one */
12304 } /* End of verifying node ends with an appropriate char */
12306 loopdone: /* Jumped to when encounters something that shouldn't be in
12309 /* I (khw) don't know if you can get here with zero length, but the
12310 * old code handled this situation by creating a zero-length EXACT
12311 * node. Might as well be NOTHING instead */
12317 /* If 'maybe_exact' is still set here, means there are no
12318 * code points in the node that participate in folds;
12319 * similarly for 'maybe_exactfu' and code points that match
12320 * differently depending on UTF8ness of the target string
12321 * (for /u), or depending on locale for /l */
12325 else if (maybe_exactfu) {
12329 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12330 FALSE /* Don't look to see if could
12331 be turned into an EXACT
12332 node, as we have already
12337 RExC_parse = p - 1;
12338 Set_Node_Cur_Length(ret, parse_start);
12339 nextchar(pRExC_state);
12341 /* len is STRLEN which is unsigned, need to copy to signed */
12344 vFAIL("Internal disaster");
12347 } /* End of label 'defchar:' */
12349 } /* End of giant switch on input character */
12355 S_regwhite( RExC_state_t *pRExC_state, char *p )
12357 const char *e = RExC_end;
12359 PERL_ARGS_ASSERT_REGWHITE;
12364 else if (*p == '#') {
12367 if (*p++ == '\n') {
12373 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12382 S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12384 /* Returns the next non-pattern-white space, non-comment character (the
12385 * latter only if 'recognize_comment is true) in the string p, which is
12386 * ended by RExC_end. If there is no line break ending a comment,
12387 * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
12388 const char *e = RExC_end;
12390 PERL_ARGS_ASSERT_REGPATWS;
12394 if ((len = is_PATWS_safe(p, e, UTF))) {
12397 else if (recognize_comment && *p == '#') {
12401 if (is_LNBREAK_safe(p, e, UTF)) {
12407 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
12416 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12418 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12419 * sets up the bitmap and any flags, removing those code points from the
12420 * inversion list, setting it to NULL should it become completely empty */
12422 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12423 assert(PL_regkind[OP(node)] == ANYOF);
12425 ANYOF_BITMAP_ZERO(node);
12426 if (*invlist_ptr) {
12428 /* This gets set if we actually need to modify things */
12429 bool change_invlist = FALSE;
12433 /* Start looking through *invlist_ptr */
12434 invlist_iterinit(*invlist_ptr);
12435 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12439 if (end == UV_MAX && start <= 256) {
12440 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12442 else if (end >= 256) {
12443 ANYOF_FLAGS(node) |= ANYOF_UTF8;
12446 /* Quit if are above what we should change */
12451 change_invlist = TRUE;
12453 /* Set all the bits in the range, up to the max that we are doing */
12454 high = (end < 255) ? end : 255;
12455 for (i = start; i <= (int) high; i++) {
12456 if (! ANYOF_BITMAP_TEST(node, i)) {
12457 ANYOF_BITMAP_SET(node, i);
12461 invlist_iterfinish(*invlist_ptr);
12463 /* Done with loop; remove any code points that are in the bitmap from
12464 * *invlist_ptr; similarly for code points above latin1 if we have a
12465 * flag to match all of them anyways */
12466 if (change_invlist) {
12467 _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12469 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12470 _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12473 /* If have completely emptied it, remove it completely */
12474 if (_invlist_len(*invlist_ptr) == 0) {
12475 SvREFCNT_dec_NN(*invlist_ptr);
12476 *invlist_ptr = NULL;
12481 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12482 Character classes ([:foo:]) can also be negated ([:^foo:]).
12483 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12484 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12485 but trigger failures because they are currently unimplemented. */
12487 #define POSIXCC_DONE(c) ((c) == ':')
12488 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12489 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12491 PERL_STATIC_INLINE I32
12492 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12495 I32 namedclass = OOB_NAMEDCLASS;
12497 PERL_ARGS_ASSERT_REGPPOSIXCC;
12499 if (value == '[' && RExC_parse + 1 < RExC_end &&
12500 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12501 POSIXCC(UCHARAT(RExC_parse)))
12503 const char c = UCHARAT(RExC_parse);
12504 char* const s = RExC_parse++;
12506 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12508 if (RExC_parse == RExC_end) {
12511 /* Try to give a better location for the error (than the end of
12512 * the string) by looking for the matching ']' */
12514 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12517 vFAIL2("Unmatched '%c' in POSIX class", c);
12519 /* Grandfather lone [:, [=, [. */
12523 const char* const t = RExC_parse++; /* skip over the c */
12526 if (UCHARAT(RExC_parse) == ']') {
12527 const char *posixcc = s + 1;
12528 RExC_parse++; /* skip over the ending ] */
12531 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12532 const I32 skip = t - posixcc;
12534 /* Initially switch on the length of the name. */
12537 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12538 this is the Perl \w
12540 namedclass = ANYOF_WORDCHAR;
12543 /* Names all of length 5. */
12544 /* alnum alpha ascii blank cntrl digit graph lower
12545 print punct space upper */
12546 /* Offset 4 gives the best switch position. */
12547 switch (posixcc[4]) {
12549 if (memEQ(posixcc, "alph", 4)) /* alpha */
12550 namedclass = ANYOF_ALPHA;
12553 if (memEQ(posixcc, "spac", 4)) /* space */
12554 namedclass = ANYOF_PSXSPC;
12557 if (memEQ(posixcc, "grap", 4)) /* graph */
12558 namedclass = ANYOF_GRAPH;
12561 if (memEQ(posixcc, "asci", 4)) /* ascii */
12562 namedclass = ANYOF_ASCII;
12565 if (memEQ(posixcc, "blan", 4)) /* blank */
12566 namedclass = ANYOF_BLANK;
12569 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12570 namedclass = ANYOF_CNTRL;
12573 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12574 namedclass = ANYOF_ALPHANUMERIC;
12577 if (memEQ(posixcc, "lowe", 4)) /* lower */
12578 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12579 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12580 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12583 if (memEQ(posixcc, "digi", 4)) /* digit */
12584 namedclass = ANYOF_DIGIT;
12585 else if (memEQ(posixcc, "prin", 4)) /* print */
12586 namedclass = ANYOF_PRINT;
12587 else if (memEQ(posixcc, "punc", 4)) /* punct */
12588 namedclass = ANYOF_PUNCT;
12593 if (memEQ(posixcc, "xdigit", 6))
12594 namedclass = ANYOF_XDIGIT;
12598 if (namedclass == OOB_NAMEDCLASS)
12600 "POSIX class [:%"UTF8f":] unknown",
12601 UTF8fARG(UTF, t - s - 1, s + 1));
12603 /* The #defines are structured so each complement is +1 to
12604 * the normal one */
12608 assert (posixcc[skip] == ':');
12609 assert (posixcc[skip+1] == ']');
12610 } else if (!SIZE_ONLY) {
12611 /* [[=foo=]] and [[.foo.]] are still future. */
12613 /* adjust RExC_parse so the warning shows after
12614 the class closes */
12615 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12617 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12620 /* Maternal grandfather:
12621 * "[:" ending in ":" but not in ":]" */
12623 vFAIL("Unmatched '[' in POSIX class");
12626 /* Grandfather lone [:, [=, [. */
12636 S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
12638 /* This applies some heuristics at the current parse position (which should
12639 * be at a '[') to see if what follows might be intended to be a [:posix:]
12640 * class. It returns true if it really is a posix class, of course, but it
12641 * also can return true if it thinks that what was intended was a posix
12642 * class that didn't quite make it.
12644 * It will return true for
12646 * [:alphanumerics] (as long as the ] isn't followed immediately by a
12647 * ')' indicating the end of the (?[
12648 * [:any garbage including %^&$ punctuation:]
12650 * This is designed to be called only from S_handle_regex_sets; it could be
12651 * easily adapted to be called from the spot at the beginning of regclass()
12652 * that checks to see in a normal bracketed class if the surrounding []
12653 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
12654 * change long-standing behavior, so I (khw) didn't do that */
12655 char* p = RExC_parse + 1;
12656 char first_char = *p;
12658 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12660 assert(*(p - 1) == '[');
12662 if (! POSIXCC(first_char)) {
12667 while (p < RExC_end && isWORDCHAR(*p)) p++;
12669 if (p >= RExC_end) {
12673 if (p - RExC_parse > 2 /* Got at least 1 word character */
12674 && (*p == first_char
12675 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12680 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12683 && p - RExC_parse > 2 /* [:] evaluates to colon;
12684 [::] is a bad posix class. */
12685 && first_char == *(p - 1));
12689 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12690 I32 *flagp, U32 depth,
12691 char * const oregcomp_parse)
12693 /* Handle the (?[...]) construct to do set operations */
12696 UV start, end; /* End points of code point ranges */
12698 char *save_end, *save_parse;
12703 const bool save_fold = FOLD;
12705 GET_RE_DEBUG_FLAGS_DECL;
12707 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12710 vFAIL("(?[...]) not valid in locale");
12712 RExC_uni_semantics = 1;
12714 /* This will return only an ANYOF regnode, or (unlikely) something smaller
12715 * (such as EXACT). Thus we can skip most everything if just sizing. We
12716 * call regclass to handle '[]' so as to not have to reinvent its parsing
12717 * rules here (throwing away the size it computes each time). And, we exit
12718 * upon an unescaped ']' that isn't one ending a regclass. To do both
12719 * these things, we need to realize that something preceded by a backslash
12720 * is escaped, so we have to keep track of backslashes */
12722 UV depth = 0; /* how many nested (?[...]) constructs */
12724 Perl_ck_warner_d(aTHX_
12725 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12726 "The regex_sets feature is experimental" REPORT_LOCATION,
12727 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12729 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12730 RExC_precomp + (RExC_parse - RExC_precomp)));
12732 while (RExC_parse < RExC_end) {
12733 SV* current = NULL;
12734 RExC_parse = regpatws(pRExC_state, RExC_parse,
12735 TRUE); /* means recognize comments */
12736 switch (*RExC_parse) {
12738 if (RExC_parse[1] == '[') depth++, RExC_parse++;
12743 /* Skip the next byte (which could cause us to end up in
12744 * the middle of a UTF-8 character, but since none of those
12745 * are confusable with anything we currently handle in this
12746 * switch (invariants all), it's safe. We'll just hit the
12747 * default: case next time and keep on incrementing until
12748 * we find one of the invariants we do handle. */
12753 /* If this looks like it is a [:posix:] class, leave the
12754 * parse pointer at the '[' to fool regclass() into
12755 * thinking it is part of a '[[:posix:]]'. That function
12756 * will use strict checking to force a syntax error if it
12757 * doesn't work out to a legitimate class */
12758 bool is_posix_class
12759 = could_it_be_a_POSIX_class(pRExC_state);
12760 if (! is_posix_class) {
12764 /* regclass() can only return RESTART_UTF8 if multi-char
12765 folds are allowed. */
12766 if (!regclass(pRExC_state, flagp,depth+1,
12767 is_posix_class, /* parse the whole char
12768 class only if not a
12770 FALSE, /* don't allow multi-char folds */
12771 TRUE, /* silence non-portable warnings. */
12773 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12776 /* function call leaves parse pointing to the ']', except
12777 * if we faked it */
12778 if (is_posix_class) {
12782 SvREFCNT_dec(current); /* In case it returned something */
12787 if (depth--) break;
12789 if (RExC_parse < RExC_end
12790 && *RExC_parse == ')')
12792 node = reganode(pRExC_state, ANYOF, 0);
12793 RExC_size += ANYOF_SKIP;
12794 nextchar(pRExC_state);
12795 Set_Node_Length(node,
12796 RExC_parse - oregcomp_parse + 1); /* MJD */
12805 FAIL("Syntax error in (?[...])");
12808 /* Pass 2 only after this. Everything in this construct is a
12809 * metacharacter. Operands begin with either a '\' (for an escape
12810 * sequence), or a '[' for a bracketed character class. Any other
12811 * character should be an operator, or parenthesis for grouping. Both
12812 * types of operands are handled by calling regclass() to parse them. It
12813 * is called with a parameter to indicate to return the computed inversion
12814 * list. The parsing here is implemented via a stack. Each entry on the
12815 * stack is a single character representing one of the operators, or the
12816 * '('; or else a pointer to an operand inversion list. */
12818 #define IS_OPERAND(a) (! SvIOK(a))
12820 /* The stack starts empty. It is a syntax error if the first thing parsed
12821 * is a binary operator; everything else is pushed on the stack. When an
12822 * operand is parsed, the top of the stack is examined. If it is a binary
12823 * operator, the item before it should be an operand, and both are replaced
12824 * by the result of doing that operation on the new operand and the one on
12825 * the stack. Thus a sequence of binary operands is reduced to a single
12826 * one before the next one is parsed.
12828 * A unary operator may immediately follow a binary in the input, for
12831 * When an operand is parsed and the top of the stack is a unary operator,
12832 * the operation is performed, and then the stack is rechecked to see if
12833 * this new operand is part of a binary operation; if so, it is handled as
12836 * A '(' is simply pushed on the stack; it is valid only if the stack is
12837 * empty, or the top element of the stack is an operator or another '('
12838 * (for which the parenthesized expression will become an operand). By the
12839 * time the corresponding ')' is parsed everything in between should have
12840 * been parsed and evaluated to a single operand (or else is a syntax
12841 * error), and is handled as a regular operand */
12843 sv_2mortal((SV *)(stack = newAV()));
12845 while (RExC_parse < RExC_end) {
12846 I32 top_index = av_tindex(stack);
12848 SV* current = NULL;
12850 /* Skip white space */
12851 RExC_parse = regpatws(pRExC_state, RExC_parse,
12852 TRUE); /* means recognize comments */
12853 if (RExC_parse >= RExC_end) {
12854 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12856 if ((curchar = UCHARAT(RExC_parse)) == ']') {
12863 if (av_tindex(stack) >= 0 /* This makes sure that we can
12864 safely subtract 1 from
12865 RExC_parse in the next clause.
12866 If we have something on the
12867 stack, we have parsed something
12869 && UCHARAT(RExC_parse - 1) == '('
12870 && RExC_parse < RExC_end)
12872 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12873 * This happens when we have some thing like
12875 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12877 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
12879 * Here we would be handling the interpolated
12880 * '$thai_or_lao'. We handle this by a recursive call to
12881 * ourselves which returns the inversion list the
12882 * interpolated expression evaluates to. We use the flags
12883 * from the interpolated pattern. */
12884 U32 save_flags = RExC_flags;
12885 const char * const save_parse = ++RExC_parse;
12887 parse_lparen_question_flags(pRExC_state);
12889 if (RExC_parse == save_parse /* Makes sure there was at
12890 least one flag (or this
12891 embedding wasn't compiled)
12893 || RExC_parse >= RExC_end - 4
12894 || UCHARAT(RExC_parse) != ':'
12895 || UCHARAT(++RExC_parse) != '('
12896 || UCHARAT(++RExC_parse) != '?'
12897 || UCHARAT(++RExC_parse) != '[')
12900 /* In combination with the above, this moves the
12901 * pointer to the point just after the first erroneous
12902 * character (or if there are no flags, to where they
12903 * should have been) */
12904 if (RExC_parse >= RExC_end - 4) {
12905 RExC_parse = RExC_end;
12907 else if (RExC_parse != save_parse) {
12908 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12910 vFAIL("Expecting '(?flags:(?[...'");
12913 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
12914 depth+1, oregcomp_parse);
12916 /* Here, 'current' contains the embedded expression's
12917 * inversion list, and RExC_parse points to the trailing
12918 * ']'; the next character should be the ')' which will be
12919 * paired with the '(' that has been put on the stack, so
12920 * the whole embedded expression reduces to '(operand)' */
12923 RExC_flags = save_flags;
12924 goto handle_operand;
12929 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12930 vFAIL("Unexpected character");
12933 /* regclass() can only return RESTART_UTF8 if multi-char
12934 folds are allowed. */
12935 if (!regclass(pRExC_state, flagp,depth+1,
12936 TRUE, /* means parse just the next thing */
12937 FALSE, /* don't allow multi-char folds */
12938 FALSE, /* don't silence non-portable warnings. */
12940 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12942 /* regclass() will return with parsing just the \ sequence,
12943 * leaving the parse pointer at the next thing to parse */
12945 goto handle_operand;
12947 case '[': /* Is a bracketed character class */
12949 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12951 if (! is_posix_class) {
12955 /* regclass() can only return RESTART_UTF8 if multi-char
12956 folds are allowed. */
12957 if(!regclass(pRExC_state, flagp,depth+1,
12958 is_posix_class, /* parse the whole char class
12959 only if not a posix class */
12960 FALSE, /* don't allow multi-char folds */
12961 FALSE, /* don't silence non-portable warnings. */
12963 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12965 /* function call leaves parse pointing to the ']', except if we
12967 if (is_posix_class) {
12971 goto handle_operand;
12980 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
12981 || ! IS_OPERAND(*top_ptr))
12984 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
12986 av_push(stack, newSVuv(curchar));
12990 av_push(stack, newSVuv(curchar));
12994 if (top_index >= 0) {
12995 top_ptr = av_fetch(stack, top_index, FALSE);
12997 if (IS_OPERAND(*top_ptr)) {
12999 vFAIL("Unexpected '(' with no preceding operator");
13002 av_push(stack, newSVuv(curchar));
13009 || ! (current = av_pop(stack))
13010 || ! IS_OPERAND(current)
13011 || ! (lparen = av_pop(stack))
13012 || IS_OPERAND(lparen)
13013 || SvUV(lparen) != '(')
13015 SvREFCNT_dec(current);
13017 vFAIL("Unexpected ')'");
13020 SvREFCNT_dec_NN(lparen);
13027 /* Here, we have an operand to process, in 'current' */
13029 if (top_index < 0) { /* Just push if stack is empty */
13030 av_push(stack, current);
13033 SV* top = av_pop(stack);
13035 char current_operator;
13037 if (IS_OPERAND(top)) {
13038 SvREFCNT_dec_NN(top);
13039 SvREFCNT_dec_NN(current);
13040 vFAIL("Operand with no preceding operator");
13042 current_operator = (char) SvUV(top);
13043 switch (current_operator) {
13044 case '(': /* Push the '(' back on followed by the new
13046 av_push(stack, top);
13047 av_push(stack, current);
13048 SvREFCNT_inc(top); /* Counters the '_dec' done
13049 just after the 'break', so
13050 it doesn't get wrongly freed
13055 _invlist_invert(current);
13057 /* Unlike binary operators, the top of the stack,
13058 * now that this unary one has been popped off, may
13059 * legally be an operator, and we now have operand
13062 SvREFCNT_dec_NN(top);
13063 goto handle_operand;
13066 prev = av_pop(stack);
13067 _invlist_intersection(prev,
13070 av_push(stack, current);
13075 prev = av_pop(stack);
13076 _invlist_union(prev, current, ¤t);
13077 av_push(stack, current);
13081 prev = av_pop(stack);;
13082 _invlist_subtract(prev, current, ¤t);
13083 av_push(stack, current);
13086 case '^': /* The union minus the intersection */
13092 prev = av_pop(stack);
13093 _invlist_union(prev, current, &u);
13094 _invlist_intersection(prev, current, &i);
13095 /* _invlist_subtract will overwrite current
13096 without freeing what it already contains */
13098 _invlist_subtract(u, i, ¤t);
13099 av_push(stack, current);
13100 SvREFCNT_dec_NN(i);
13101 SvREFCNT_dec_NN(u);
13102 SvREFCNT_dec_NN(element);
13107 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13109 SvREFCNT_dec_NN(top);
13110 SvREFCNT_dec(prev);
13114 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13117 if (av_tindex(stack) < 0 /* Was empty */
13118 || ((final = av_pop(stack)) == NULL)
13119 || ! IS_OPERAND(final)
13120 || av_tindex(stack) >= 0) /* More left on stack */
13122 vFAIL("Incomplete expression within '(?[ ])'");
13125 /* Here, 'final' is the resultant inversion list from evaluating the
13126 * expression. Return it if so requested */
13127 if (return_invlist) {
13128 *return_invlist = final;
13132 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13133 * expecting a string of ranges and individual code points */
13134 invlist_iterinit(final);
13135 result_string = newSVpvs("");
13136 while (invlist_iternext(final, &start, &end)) {
13137 if (start == end) {
13138 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13141 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13146 save_parse = RExC_parse;
13147 RExC_parse = SvPV(result_string, len);
13148 save_end = RExC_end;
13149 RExC_end = RExC_parse + len;
13151 /* We turn off folding around the call, as the class we have constructed
13152 * already has all folding taken into consideration, and we don't want
13153 * regclass() to add to that */
13154 RExC_flags &= ~RXf_PMf_FOLD;
13155 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13157 node = regclass(pRExC_state, flagp,depth+1,
13158 FALSE, /* means parse the whole char class */
13159 FALSE, /* don't allow multi-char folds */
13160 TRUE, /* silence non-portable warnings. The above may very
13161 well have generated non-portable code points, but
13162 they're valid on this machine */
13165 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13168 RExC_flags |= RXf_PMf_FOLD;
13170 RExC_parse = save_parse + 1;
13171 RExC_end = save_end;
13172 SvREFCNT_dec_NN(final);
13173 SvREFCNT_dec_NN(result_string);
13175 nextchar(pRExC_state);
13176 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13181 /* The names of properties whose definitions are not known at compile time are
13182 * stored in this SV, after a constant heading. So if the length has been
13183 * changed since initialization, then there is a run-time definition. */
13184 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
13185 (SvCUR(listsv) != initial_listsv_len)
13188 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13189 const bool stop_at_1, /* Just parse the next thing, don't
13190 look for a full character class */
13191 bool allow_multi_folds,
13192 const bool silence_non_portable, /* Don't output warnings
13195 SV** ret_invlist) /* Return an inversion list, not a node */
13197 /* parse a bracketed class specification. Most of these will produce an
13198 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13199 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
13200 * under /i with multi-character folds: it will be rewritten following the
13201 * paradigm of this example, where the <multi-fold>s are characters which
13202 * fold to multiple character sequences:
13203 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13204 * gets effectively rewritten as:
13205 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13206 * reg() gets called (recursively) on the rewritten version, and this
13207 * function will return what it constructs. (Actually the <multi-fold>s
13208 * aren't physically removed from the [abcdefghi], it's just that they are
13209 * ignored in the recursion by means of a flag:
13210 * <RExC_in_multi_char_class>.)
13212 * ANYOF nodes contain a bit map for the first 256 characters, with the
13213 * corresponding bit set if that character is in the list. For characters
13214 * above 255, a range list or swash is used. There are extra bits for \w,
13215 * etc. in locale ANYOFs, as what these match is not determinable at
13218 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13219 * to be restarted. This can only happen if ret_invlist is non-NULL.
13223 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13225 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13228 IV namedclass = OOB_NAMEDCLASS;
13229 char *rangebegin = NULL;
13230 bool need_class = 0;
13232 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13233 than just initialized. */
13234 SV* properties = NULL; /* Code points that match \p{} \P{} */
13235 SV* posixes = NULL; /* Code points that match classes like [:word:],
13236 extended beyond the Latin1 range. These have to
13237 be kept separate from other code points for much
13238 of this function because their handling is
13239 different under /i, and for most classes under
13241 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
13242 separate for a while from the non-complemented
13243 versions because of complications with /d
13245 UV element_count = 0; /* Number of distinct elements in the class.
13246 Optimizations may be possible if this is tiny */
13247 AV * multi_char_matches = NULL; /* Code points that fold to more than one
13248 character; used under /i */
13250 char * stop_ptr = RExC_end; /* where to stop parsing */
13251 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13253 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13255 /* Unicode properties are stored in a swash; this holds the current one
13256 * being parsed. If this swash is the only above-latin1 component of the
13257 * character class, an optimization is to pass it directly on to the
13258 * execution engine. Otherwise, it is set to NULL to indicate that there
13259 * are other things in the class that have to be dealt with at execution
13261 SV* swash = NULL; /* Code points that match \p{} \P{} */
13263 /* Set if a component of this character class is user-defined; just passed
13264 * on to the engine */
13265 bool has_user_defined_property = FALSE;
13267 /* inversion list of code points this node matches only when the target
13268 * string is in UTF-8. (Because is under /d) */
13269 SV* depends_list = NULL;
13271 /* Inversion list of code points this node matches regardless of things
13272 * like locale, folding, utf8ness of the target string */
13273 SV* cp_list = NULL;
13275 /* Like cp_list, but code points on this list need to be checked for things
13276 * that fold to/from them under /i */
13277 SV* cp_foldable_list = NULL;
13279 /* Like cp_list, but code points on this list are valid only when the
13280 * runtime locale is UTF-8 */
13281 SV* only_utf8_locale_list = NULL;
13284 /* In a range, counts how many 0-2 of the ends of it came from literals,
13285 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
13286 UV literal_endpoint = 0;
13288 bool invert = FALSE; /* Is this class to be complemented */
13290 bool warn_super = ALWAYS_WARN_SUPER;
13292 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13293 case we need to change the emitted regop to an EXACT. */
13294 const char * orig_parse = RExC_parse;
13295 const SSize_t orig_size = RExC_size;
13296 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13297 GET_RE_DEBUG_FLAGS_DECL;
13299 PERL_ARGS_ASSERT_REGCLASS;
13301 PERL_UNUSED_ARG(depth);
13304 DEBUG_PARSE("clas");
13306 /* Assume we are going to generate an ANYOF node. */
13307 ret = reganode(pRExC_state, ANYOF, 0);
13310 RExC_size += ANYOF_SKIP;
13311 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13314 ANYOF_FLAGS(ret) = 0;
13316 RExC_emit += ANYOF_SKIP;
13317 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13318 initial_listsv_len = SvCUR(listsv);
13319 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
13323 RExC_parse = regpatws(pRExC_state, RExC_parse,
13324 FALSE /* means don't recognize comments */);
13327 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13330 allow_multi_folds = FALSE;
13333 RExC_parse = regpatws(pRExC_state, RExC_parse,
13334 FALSE /* means don't recognize comments */);
13338 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13339 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13340 const char *s = RExC_parse;
13341 const char c = *s++;
13343 while (isWORDCHAR(*s))
13345 if (*s && c == *s && s[1] == ']') {
13346 SAVEFREESV(RExC_rx_sv);
13348 "POSIX syntax [%c %c] belongs inside character classes",
13350 (void)ReREFCNT_inc(RExC_rx_sv);
13354 /* If the caller wants us to just parse a single element, accomplish this
13355 * by faking the loop ending condition */
13356 if (stop_at_1 && RExC_end > RExC_parse) {
13357 stop_ptr = RExC_parse + 1;
13360 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13361 if (UCHARAT(RExC_parse) == ']')
13362 goto charclassloop;
13366 if (RExC_parse >= stop_ptr) {
13371 RExC_parse = regpatws(pRExC_state, RExC_parse,
13372 FALSE /* means don't recognize comments */);
13375 if (UCHARAT(RExC_parse) == ']') {
13381 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13382 save_value = value;
13383 save_prevvalue = prevvalue;
13386 rangebegin = RExC_parse;
13390 value = utf8n_to_uvchr((U8*)RExC_parse,
13391 RExC_end - RExC_parse,
13392 &numlen, UTF8_ALLOW_DEFAULT);
13393 RExC_parse += numlen;
13396 value = UCHARAT(RExC_parse++);
13399 && RExC_parse < RExC_end
13400 && POSIXCC(UCHARAT(RExC_parse)))
13402 namedclass = regpposixcc(pRExC_state, value, strict);
13404 else if (value == '\\') {
13406 value = utf8n_to_uvchr((U8*)RExC_parse,
13407 RExC_end - RExC_parse,
13408 &numlen, UTF8_ALLOW_DEFAULT);
13409 RExC_parse += numlen;
13412 value = UCHARAT(RExC_parse++);
13414 /* Some compilers cannot handle switching on 64-bit integer
13415 * values, therefore value cannot be an UV. Yes, this will
13416 * be a problem later if we want switch on Unicode.
13417 * A similar issue a little bit later when switching on
13418 * namedclass. --jhi */
13420 /* If the \ is escaping white space when white space is being
13421 * skipped, it means that that white space is wanted literally, and
13422 * is already in 'value'. Otherwise, need to translate the escape
13423 * into what it signifies. */
13424 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13426 case 'w': namedclass = ANYOF_WORDCHAR; break;
13427 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13428 case 's': namedclass = ANYOF_SPACE; break;
13429 case 'S': namedclass = ANYOF_NSPACE; break;
13430 case 'd': namedclass = ANYOF_DIGIT; break;
13431 case 'D': namedclass = ANYOF_NDIGIT; break;
13432 case 'v': namedclass = ANYOF_VERTWS; break;
13433 case 'V': namedclass = ANYOF_NVERTWS; break;
13434 case 'h': namedclass = ANYOF_HORIZWS; break;
13435 case 'H': namedclass = ANYOF_NHORIZWS; break;
13436 case 'N': /* Handle \N{NAME} in class */
13438 /* We only pay attention to the first char of
13439 multichar strings being returned. I kinda wonder
13440 if this makes sense as it does change the behaviour
13441 from earlier versions, OTOH that behaviour was broken
13443 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13444 TRUE, /* => charclass */
13447 if (*flagp & RESTART_UTF8)
13448 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13458 /* We will handle any undefined properties ourselves */
13459 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13460 /* And we actually would prefer to get
13461 * the straight inversion list of the
13462 * swash, since we will be accessing it
13463 * anyway, to save a little time */
13464 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13466 if (RExC_parse >= RExC_end)
13467 vFAIL2("Empty \\%c{}", (U8)value);
13468 if (*RExC_parse == '{') {
13469 const U8 c = (U8)value;
13470 e = strchr(RExC_parse++, '}');
13472 vFAIL2("Missing right brace on \\%c{}", c);
13473 while (isSPACE(UCHARAT(RExC_parse)))
13475 if (e == RExC_parse)
13476 vFAIL2("Empty \\%c{}", c);
13477 n = e - RExC_parse;
13478 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
13490 if (UCHARAT(RExC_parse) == '^') {
13493 /* toggle. (The rhs xor gets the single bit that
13494 * differs between P and p; the other xor inverts just
13496 value ^= 'P' ^ 'p';
13498 while (isSPACE(UCHARAT(RExC_parse))) {
13503 /* Try to get the definition of the property into
13504 * <invlist>. If /i is in effect, the effective property
13505 * will have its name be <__NAME_i>. The design is
13506 * discussed in commit
13507 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13508 formatted = Perl_form(aTHX_
13510 (FOLD) ? "__" : "",
13515 name = savepvn(formatted, strlen(formatted));
13517 /* Look up the property name, and get its swash and
13518 * inversion list, if the property is found */
13520 SvREFCNT_dec_NN(swash);
13522 swash = _core_swash_init("utf8", name, &PL_sv_undef,
13525 NULL, /* No inversion list */
13528 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13530 SvREFCNT_dec_NN(swash);
13534 /* Here didn't find it. It could be a user-defined
13535 * property that will be available at run-time. If we
13536 * accept only compile-time properties, is an error;
13537 * otherwise add it to the list for run-time look up */
13539 RExC_parse = e + 1;
13541 "Property '%"UTF8f"' is unknown",
13542 UTF8fARG(UTF, n, name));
13544 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13545 (value == 'p' ? '+' : '!'),
13546 UTF8fARG(UTF, n, name));
13547 has_user_defined_property = TRUE;
13549 /* We don't know yet, so have to assume that the
13550 * property could match something in the Latin1 range,
13551 * hence something that isn't utf8. Note that this
13552 * would cause things in <depends_list> to match
13553 * inappropriately, except that any \p{}, including
13554 * this one forces Unicode semantics, which means there
13555 * is no <depends_list> */
13556 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13560 /* Here, did get the swash and its inversion list. If
13561 * the swash is from a user-defined property, then this
13562 * whole character class should be regarded as such */
13563 if (swash_init_flags
13564 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13566 has_user_defined_property = TRUE;
13569 /* We warn on matching an above-Unicode code point
13570 * if the match would return true, except don't
13571 * warn for \p{All}, which has exactly one element
13573 (_invlist_contains_cp(invlist, 0x110000)
13574 && (! (_invlist_len(invlist) == 1
13575 && *invlist_array(invlist) == 0)))
13581 /* Invert if asking for the complement */
13582 if (value == 'P') {
13583 _invlist_union_complement_2nd(properties,
13587 /* The swash can't be used as-is, because we've
13588 * inverted things; delay removing it to here after
13589 * have copied its invlist above */
13590 SvREFCNT_dec_NN(swash);
13594 _invlist_union(properties, invlist, &properties);
13599 RExC_parse = e + 1;
13600 namedclass = ANYOF_UNIPROP; /* no official name, but it's
13603 /* \p means they want Unicode semantics */
13604 RExC_uni_semantics = 1;
13607 case 'n': value = '\n'; break;
13608 case 'r': value = '\r'; break;
13609 case 't': value = '\t'; break;
13610 case 'f': value = '\f'; break;
13611 case 'b': value = '\b'; break;
13612 case 'e': value = ASCII_TO_NATIVE('\033');break;
13613 case 'a': value = '\a'; break;
13615 RExC_parse--; /* function expects to be pointed at the 'o' */
13617 const char* error_msg;
13618 bool valid = grok_bslash_o(&RExC_parse,
13621 SIZE_ONLY, /* warnings in pass
13624 silence_non_portable,
13630 if (PL_encoding && value < 0x100) {
13631 goto recode_encoding;
13635 RExC_parse--; /* function expects to be pointed at the 'x' */
13637 const char* error_msg;
13638 bool valid = grok_bslash_x(&RExC_parse,
13641 TRUE, /* Output warnings */
13643 silence_non_portable,
13649 if (PL_encoding && value < 0x100)
13650 goto recode_encoding;
13653 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13655 case '0': case '1': case '2': case '3': case '4':
13656 case '5': case '6': case '7':
13658 /* Take 1-3 octal digits */
13659 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13660 numlen = (strict) ? 4 : 3;
13661 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13662 RExC_parse += numlen;
13665 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13666 vFAIL("Need exactly 3 octal digits");
13668 else if (! SIZE_ONLY /* like \08, \178 */
13670 && RExC_parse < RExC_end
13671 && isDIGIT(*RExC_parse)
13672 && ckWARN(WARN_REGEXP))
13674 SAVEFREESV(RExC_rx_sv);
13675 reg_warn_non_literal_string(
13677 form_short_octal_warning(RExC_parse, numlen));
13678 (void)ReREFCNT_inc(RExC_rx_sv);
13681 if (PL_encoding && value < 0x100)
13682 goto recode_encoding;
13686 if (! RExC_override_recoding) {
13687 SV* enc = PL_encoding;
13688 value = reg_recode((const char)(U8)value, &enc);
13691 vFAIL("Invalid escape in the specified encoding");
13693 else if (SIZE_ONLY) {
13694 ckWARNreg(RExC_parse,
13695 "Invalid escape in the specified encoding");
13701 /* Allow \_ to not give an error */
13702 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13704 vFAIL2("Unrecognized escape \\%c in character class",
13708 SAVEFREESV(RExC_rx_sv);
13709 ckWARN2reg(RExC_parse,
13710 "Unrecognized escape \\%c in character class passed through",
13712 (void)ReREFCNT_inc(RExC_rx_sv);
13716 } /* End of switch on char following backslash */
13717 } /* end of handling backslash escape sequences */
13720 literal_endpoint++;
13723 /* Here, we have the current token in 'value' */
13725 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13728 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
13729 * literal, as is the character that began the false range, i.e.
13730 * the 'a' in the examples */
13733 const int w = (RExC_parse >= rangebegin)
13734 ? RExC_parse - rangebegin
13738 "False [] range \"%"UTF8f"\"",
13739 UTF8fARG(UTF, w, rangebegin));
13742 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13743 ckWARN2reg(RExC_parse,
13744 "False [] range \"%"UTF8f"\"",
13745 UTF8fARG(UTF, w, rangebegin));
13746 (void)ReREFCNT_inc(RExC_rx_sv);
13747 cp_list = add_cp_to_invlist(cp_list, '-');
13748 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13753 range = 0; /* this was not a true range */
13754 element_count += 2; /* So counts for three values */
13757 classnum = namedclass_to_classnum(namedclass);
13759 if (LOC && namedclass < ANYOF_POSIXL_MAX
13760 #ifndef HAS_ISASCII
13761 && classnum != _CC_ASCII
13764 /* What the Posix classes (like \w, [:space:]) match in locale
13765 * isn't knowable under locale until actual match time. Room
13766 * must be reserved (one time per outer bracketed class) to
13767 * store such classes. The space will contain a bit for each
13768 * named class that is to be matched against. This isn't
13769 * needed for \p{} and pseudo-classes, as they are not affected
13770 * by locale, and hence are dealt with separately */
13771 if (! need_class) {
13774 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13777 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13779 ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13780 ANYOF_POSIXL_ZERO(ret);
13783 /* See if it already matches the complement of this POSIX
13785 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13786 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13790 posixl_matches_all = TRUE;
13791 break; /* No need to continue. Since it matches both
13792 e.g., \w and \W, it matches everything, and the
13793 bracketed class can be optimized into qr/./s */
13796 /* Add this class to those that should be checked at runtime */
13797 ANYOF_POSIXL_SET(ret, namedclass);
13799 /* The above-Latin1 characters are not subject to locale rules.
13800 * Just add them, in the second pass, to the
13801 * unconditionally-matched list */
13803 SV* scratch_list = NULL;
13805 /* Get the list of the above-Latin1 code points this
13807 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13808 PL_XPosix_ptrs[classnum],
13810 /* Odd numbers are complements, like
13811 * NDIGIT, NASCII, ... */
13812 namedclass % 2 != 0,
13814 /* Checking if 'cp_list' is NULL first saves an extra
13815 * clone. Its reference count will be decremented at the
13816 * next union, etc, or if this is the only instance, at the
13817 * end of the routine */
13819 cp_list = scratch_list;
13822 _invlist_union(cp_list, scratch_list, &cp_list);
13823 SvREFCNT_dec_NN(scratch_list);
13825 continue; /* Go get next character */
13828 else if (! SIZE_ONLY) {
13830 /* Here, not in pass1 (in that pass we skip calculating the
13831 * contents of this class), and is /l, or is a POSIX class for
13832 * which /l doesn't matter (or is a Unicode property, which is
13833 * skipped here). */
13834 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
13835 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13837 /* Here, should be \h, \H, \v, or \V. None of /d, /i
13838 * nor /l make a difference in what these match,
13839 * therefore we just add what they match to cp_list. */
13840 if (classnum != _CC_VERTSPACE) {
13841 assert( namedclass == ANYOF_HORIZWS
13842 || namedclass == ANYOF_NHORIZWS);
13844 /* It turns out that \h is just a synonym for
13846 classnum = _CC_BLANK;
13849 _invlist_union_maybe_complement_2nd(
13851 PL_XPosix_ptrs[classnum],
13852 namedclass % 2 != 0, /* Complement if odd
13853 (NHORIZWS, NVERTWS)
13858 else { /* Garden variety class. If is NASCII, NDIGIT, ...
13859 complement and use nposixes */
13860 SV** posixes_ptr = namedclass % 2 == 0
13863 SV** source_ptr = &PL_XPosix_ptrs[classnum];
13864 _invlist_union_maybe_complement_2nd(
13867 namedclass % 2 != 0,
13870 continue; /* Go get next character */
13872 } /* end of namedclass \blah */
13874 /* Here, we have a single value. If 'range' is set, it is the ending
13875 * of a range--check its validity. Later, we will handle each
13876 * individual code point in the range. If 'range' isn't set, this
13877 * could be the beginning of a range, so check for that by looking
13878 * ahead to see if the next real character to be processed is the range
13879 * indicator--the minus sign */
13882 RExC_parse = regpatws(pRExC_state, RExC_parse,
13883 FALSE /* means don't recognize comments */);
13887 if (prevvalue > value) /* b-a */ {
13888 const int w = RExC_parse - rangebegin;
13890 "Invalid [] range \"%"UTF8f"\"",
13891 UTF8fARG(UTF, w, rangebegin));
13892 range = 0; /* not a valid range */
13896 prevvalue = value; /* save the beginning of the potential range */
13897 if (! stop_at_1 /* Can't be a range if parsing just one thing */
13898 && *RExC_parse == '-')
13900 char* next_char_ptr = RExC_parse + 1;
13901 if (skip_white) { /* Get the next real char after the '-' */
13902 next_char_ptr = regpatws(pRExC_state,
13904 FALSE); /* means don't recognize
13908 /* If the '-' is at the end of the class (just before the ']',
13909 * it is a literal minus; otherwise it is a range */
13910 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
13911 RExC_parse = next_char_ptr;
13913 /* a bad range like \w-, [:word:]- ? */
13914 if (namedclass > OOB_NAMEDCLASS) {
13915 if (strict || ckWARN(WARN_REGEXP)) {
13917 RExC_parse >= rangebegin ?
13918 RExC_parse - rangebegin : 0;
13920 vFAIL4("False [] range \"%*.*s\"",
13925 "False [] range \"%*.*s\"",
13930 cp_list = add_cp_to_invlist(cp_list, '-');
13934 range = 1; /* yeah, it's a range! */
13935 continue; /* but do it the next time */
13940 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
13943 /* non-Latin1 code point implies unicode semantics. Must be set in
13944 * pass1 so is there for the whole of pass 2 */
13946 RExC_uni_semantics = 1;
13949 /* Ready to process either the single value, or the completed range.
13950 * For single-valued non-inverted ranges, we consider the possibility
13951 * of multi-char folds. (We made a conscious decision to not do this
13952 * for the other cases because it can often lead to non-intuitive
13953 * results. For example, you have the peculiar case that:
13954 * "s s" =~ /^[^\xDF]+$/i => Y
13955 * "ss" =~ /^[^\xDF]+$/i => N
13957 * See [perl #89750] */
13958 if (FOLD && allow_multi_folds && value == prevvalue) {
13959 if (value == LATIN_SMALL_LETTER_SHARP_S
13960 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
13963 /* Here <value> is indeed a multi-char fold. Get what it is */
13965 U8 foldbuf[UTF8_MAXBYTES_CASE];
13968 UV folded = _to_uni_fold_flags(
13972 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
13973 ? FOLD_FLAGS_NOMIX_ASCII
13977 /* Here, <folded> should be the first character of the
13978 * multi-char fold of <value>, with <foldbuf> containing the
13979 * whole thing. But, if this fold is not allowed (because of
13980 * the flags), <fold> will be the same as <value>, and should
13981 * be processed like any other character, so skip the special
13983 if (folded != value) {
13985 /* Skip if we are recursed, currently parsing the class
13986 * again. Otherwise add this character to the list of
13987 * multi-char folds. */
13988 if (! RExC_in_multi_char_class) {
13989 AV** this_array_ptr;
13991 STRLEN cp_count = utf8_length(foldbuf,
13992 foldbuf + foldlen);
13993 SV* multi_fold = sv_2mortal(newSVpvn("", 0));
13995 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
13998 if (! multi_char_matches) {
13999 multi_char_matches = newAV();
14002 /* <multi_char_matches> is actually an array of arrays.
14003 * There will be one or two top-level elements: [2],
14004 * and/or [3]. The [2] element is an array, each
14005 * element thereof is a character which folds to TWO
14006 * characters; [3] is for folds to THREE characters.
14007 * (Unicode guarantees a maximum of 3 characters in any
14008 * fold.) When we rewrite the character class below,
14009 * we will do so such that the longest folds are
14010 * written first, so that it prefers the longest
14011 * matching strings first. This is done even if it
14012 * turns out that any quantifier is non-greedy, out of
14013 * programmer laziness. Tom Christiansen has agreed
14014 * that this is ok. This makes the test for the
14015 * ligature 'ffi' come before the test for 'ff' */
14016 if (av_exists(multi_char_matches, cp_count)) {
14017 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14019 this_array = *this_array_ptr;
14022 this_array = newAV();
14023 av_store(multi_char_matches, cp_count,
14026 av_push(this_array, multi_fold);
14029 /* This element should not be processed further in this
14032 value = save_value;
14033 prevvalue = save_prevvalue;
14039 /* Deal with this element of the class */
14042 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14045 SV* this_range = _new_invlist(1);
14046 _append_range_to_invlist(this_range, prevvalue, value);
14048 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14049 * If this range was specified using something like 'i-j', we want
14050 * to include only the 'i' and the 'j', and not anything in
14051 * between, so exclude non-ASCII, non-alphabetics from it.
14052 * However, if the range was specified with something like
14053 * [\x89-\x91] or [\x89-j], all code points within it should be
14054 * included. literal_endpoint==2 means both ends of the range used
14055 * a literal character, not \x{foo} */
14056 if (literal_endpoint == 2
14057 && ((prevvalue >= 'a' && value <= 'z')
14058 || (prevvalue >= 'A' && value <= 'Z')))
14060 _invlist_intersection(this_range, PL_ASCII,
14063 /* Since this above only contains ascii, the intersection of it
14064 * with anything will still yield only ascii */
14065 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14068 _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14069 literal_endpoint = 0;
14073 range = 0; /* this range (if it was one) is done now */
14074 } /* End of loop through all the text within the brackets */
14076 /* If anything in the class expands to more than one character, we have to
14077 * deal with them by building up a substitute parse string, and recursively
14078 * calling reg() on it, instead of proceeding */
14079 if (multi_char_matches) {
14080 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14083 char *save_end = RExC_end;
14084 char *save_parse = RExC_parse;
14085 bool first_time = TRUE; /* First multi-char occurrence doesn't get
14090 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
14091 because too confusing */
14093 sv_catpv(substitute_parse, "(?:");
14097 /* Look at the longest folds first */
14098 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14100 if (av_exists(multi_char_matches, cp_count)) {
14101 AV** this_array_ptr;
14104 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14106 while ((this_sequence = av_pop(*this_array_ptr)) !=
14109 if (! first_time) {
14110 sv_catpv(substitute_parse, "|");
14112 first_time = FALSE;
14114 sv_catpv(substitute_parse, SvPVX(this_sequence));
14119 /* If the character class contains anything else besides these
14120 * multi-character folds, have to include it in recursive parsing */
14121 if (element_count) {
14122 sv_catpv(substitute_parse, "|[");
14123 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14124 sv_catpv(substitute_parse, "]");
14127 sv_catpv(substitute_parse, ")");
14130 /* This is a way to get the parse to skip forward a whole named
14131 * sequence instead of matching the 2nd character when it fails the
14133 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14137 RExC_parse = SvPV(substitute_parse, len);
14138 RExC_end = RExC_parse + len;
14139 RExC_in_multi_char_class = 1;
14140 RExC_emit = (regnode *)orig_emit;
14142 ret = reg(pRExC_state, 1, ®_flags, depth+1);
14144 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14146 RExC_parse = save_parse;
14147 RExC_end = save_end;
14148 RExC_in_multi_char_class = 0;
14149 SvREFCNT_dec_NN(multi_char_matches);
14153 /* Here, we've gone through the entire class and dealt with multi-char
14154 * folds. We are now in a position that we can do some checks to see if we
14155 * can optimize this ANYOF node into a simpler one, even in Pass 1.
14156 * Currently we only do two checks:
14157 * 1) is in the unlikely event that the user has specified both, eg. \w and
14158 * \W under /l, then the class matches everything. (This optimization
14159 * is done only to make the optimizer code run later work.)
14160 * 2) if the character class contains only a single element (including a
14161 * single range), we see if there is an equivalent node for it.
14162 * Other checks are possible */
14163 if (! ret_invlist /* Can't optimize if returning the constructed
14165 && (UNLIKELY(posixl_matches_all) || element_count == 1))
14170 if (UNLIKELY(posixl_matches_all)) {
14173 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14174 \w or [:digit:] or \p{foo}
14177 /* All named classes are mapped into POSIXish nodes, with its FLAG
14178 * argument giving which class it is */
14179 switch ((I32)namedclass) {
14180 case ANYOF_UNIPROP:
14183 /* These don't depend on the charset modifiers. They always
14184 * match under /u rules */
14185 case ANYOF_NHORIZWS:
14186 case ANYOF_HORIZWS:
14187 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14190 case ANYOF_NVERTWS:
14195 /* The actual POSIXish node for all the rest depends on the
14196 * charset modifier. The ones in the first set depend only on
14197 * ASCII or, if available on this platform, locale */
14201 op = (LOC) ? POSIXL : POSIXA;
14212 /* under /a could be alpha */
14214 if (ASCII_RESTRICTED) {
14215 namedclass = ANYOF_ALPHA + (namedclass % 2);
14223 /* The rest have more possibilities depending on the charset.
14224 * We take advantage of the enum ordering of the charset
14225 * modifiers to get the exact node type, */
14227 op = POSIXD + get_regex_charset(RExC_flags);
14228 if (op > POSIXA) { /* /aa is same as /a */
14233 /* The odd numbered ones are the complements of the
14234 * next-lower even number one */
14235 if (namedclass % 2 == 1) {
14239 arg = namedclass_to_classnum(namedclass);
14243 else if (value == prevvalue) {
14245 /* Here, the class consists of just a single code point */
14248 if (! LOC && value == '\n') {
14249 op = REG_ANY; /* Optimize [^\n] */
14250 *flagp |= HASWIDTH|SIMPLE;
14254 else if (value < 256 || UTF) {
14256 /* Optimize a single value into an EXACTish node, but not if it
14257 * would require converting the pattern to UTF-8. */
14258 op = compute_EXACTish(pRExC_state);
14260 } /* Otherwise is a range */
14261 else if (! LOC) { /* locale could vary these */
14262 if (prevvalue == '0') {
14263 if (value == '9') {
14270 /* Here, we have changed <op> away from its initial value iff we found
14271 * an optimization */
14274 /* Throw away this ANYOF regnode, and emit the calculated one,
14275 * which should correspond to the beginning, not current, state of
14277 const char * cur_parse = RExC_parse;
14278 RExC_parse = (char *)orig_parse;
14282 /* To get locale nodes to not use the full ANYOF size would
14283 * require moving the code above that writes the portions
14284 * of it that aren't in other nodes to after this point.
14285 * e.g. ANYOF_POSIXL_SET */
14286 RExC_size = orig_size;
14290 RExC_emit = (regnode *)orig_emit;
14291 if (PL_regkind[op] == POSIXD) {
14292 if (op == POSIXL) {
14293 RExC_contains_locale = 1;
14296 op += NPOSIXD - POSIXD;
14301 ret = reg_node(pRExC_state, op);
14303 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14307 *flagp |= HASWIDTH|SIMPLE;
14309 else if (PL_regkind[op] == EXACT) {
14310 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14311 TRUE /* downgradable to EXACT */
14315 RExC_parse = (char *) cur_parse;
14317 SvREFCNT_dec(posixes);
14318 SvREFCNT_dec(nposixes);
14319 SvREFCNT_dec(cp_list);
14320 SvREFCNT_dec(cp_foldable_list);
14327 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14329 /* If folding, we calculate all characters that could fold to or from the
14330 * ones already on the list */
14331 if (cp_foldable_list) {
14333 UV start, end; /* End points of code point ranges */
14335 SV* fold_intersection = NULL;
14338 /* Our calculated list will be for Unicode rules. For locale
14339 * matching, we have to keep a separate list that is consulted at
14340 * runtime only when the locale indicates Unicode rules. For
14341 * non-locale, we just use to the general list */
14343 use_list = &only_utf8_locale_list;
14346 use_list = &cp_list;
14349 /* Only the characters in this class that participate in folds need
14350 * be checked. Get the intersection of this class and all the
14351 * possible characters that are foldable. This can quickly narrow
14352 * down a large class */
14353 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14354 &fold_intersection);
14356 /* The folds for all the Latin1 characters are hard-coded into this
14357 * program, but we have to go out to disk to get the others. */
14358 if (invlist_highest(cp_foldable_list) >= 256) {
14360 /* This is a hash that for a particular fold gives all
14361 * characters that are involved in it */
14362 if (! PL_utf8_foldclosures) {
14364 /* If the folds haven't been read in, call a fold function
14366 if (! PL_utf8_tofold) {
14367 U8 dummy[UTF8_MAXBYTES_CASE+1];
14369 /* This string is just a short named one above \xff */
14370 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
14371 assert(PL_utf8_tofold); /* Verify that worked */
14373 PL_utf8_foldclosures
14374 = _swash_inversion_hash(PL_utf8_tofold);
14378 /* Now look at the foldable characters in this class individually */
14379 invlist_iterinit(fold_intersection);
14380 while (invlist_iternext(fold_intersection, &start, &end)) {
14383 /* Look at every character in the range */
14384 for (j = start; j <= end; j++) {
14385 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14391 /* We have the latin1 folding rules hard-coded here so
14392 * that an innocent-looking character class, like
14393 * /[ks]/i won't have to go out to disk to find the
14394 * possible matches. XXX It would be better to
14395 * generate these via regen, in case a new version of
14396 * the Unicode standard adds new mappings, though that
14397 * is not really likely, and may be caught by the
14398 * default: case of the switch below. */
14400 if (IS_IN_SOME_FOLD_L1(j)) {
14402 /* ASCII is always matched; non-ASCII is matched
14403 * only under Unicode rules (which could happen
14404 * under /l if the locale is a UTF-8 one */
14405 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14406 *use_list = add_cp_to_invlist(*use_list,
14407 PL_fold_latin1[j]);
14411 add_cp_to_invlist(depends_list,
14412 PL_fold_latin1[j]);
14416 if (HAS_NONLATIN1_FOLD_CLOSURE(j)
14417 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14419 /* Certain Latin1 characters have matches outside
14420 * Latin1. To get here, <j> is one of those
14421 * characters. None of these matches is valid for
14422 * ASCII characters under /aa, which is why the 'if'
14423 * just above excludes those. These matches only
14424 * happen when the target string is utf8. The code
14425 * below adds the single fold closures for <j> to the
14426 * inversion list. */
14432 add_cp_to_invlist(*use_list, KELVIN_SIGN);
14436 *use_list = add_cp_to_invlist(*use_list,
14437 LATIN_SMALL_LETTER_LONG_S);
14440 *use_list = add_cp_to_invlist(*use_list,
14441 GREEK_CAPITAL_LETTER_MU);
14442 *use_list = add_cp_to_invlist(*use_list,
14443 GREEK_SMALL_LETTER_MU);
14445 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14446 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14448 add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
14450 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14451 *use_list = add_cp_to_invlist(*use_list,
14452 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14454 case LATIN_SMALL_LETTER_SHARP_S:
14455 *use_list = add_cp_to_invlist(*use_list,
14456 LATIN_CAPITAL_LETTER_SHARP_S);
14458 case 'F': case 'f':
14459 case 'I': case 'i':
14460 case 'L': case 'l':
14461 case 'T': case 't':
14462 case 'A': case 'a':
14463 case 'H': case 'h':
14464 case 'J': case 'j':
14465 case 'N': case 'n':
14466 case 'W': case 'w':
14467 case 'Y': case 'y':
14468 /* These all are targets of multi-character
14469 * folds from code points that require UTF8
14470 * to express, so they can't match unless
14471 * the target string is in UTF-8, so no
14472 * action here is necessary, as regexec.c
14473 * properly handles the general case for
14474 * UTF-8 matching and multi-char folds */
14477 /* Use deprecated warning to increase the
14478 * chances of this being output */
14479 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
14486 /* Here is an above Latin1 character. We don't have the
14487 * rules hard-coded for it. First, get its fold. This is
14488 * the simple fold, as the multi-character folds have been
14489 * handled earlier and separated out */
14490 _to_uni_fold_flags(j, foldbuf, &foldlen,
14491 (ASCII_FOLD_RESTRICTED)
14492 ? FOLD_FLAGS_NOMIX_ASCII
14495 /* Single character fold of above Latin1. Add everything in
14496 * its fold closure to the list that this node should match.
14497 * The fold closures data structure is a hash with the keys
14498 * being the UTF-8 of every character that is folded to, like
14499 * 'k', and the values each an array of all code points that
14500 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14501 * Multi-character folds are not included */
14502 if ((listp = hv_fetch(PL_utf8_foldclosures,
14503 (char *) foldbuf, foldlen, FALSE)))
14505 AV* list = (AV*) *listp;
14507 for (k = 0; k <= av_tindex(list); k++) {
14508 SV** c_p = av_fetch(list, k, FALSE);
14511 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
14515 /* /aa doesn't allow folds between ASCII and non- */
14516 if ((ASCII_FOLD_RESTRICTED
14517 && (isASCII(c) != isASCII(j))))
14522 /* Folds under /l which cross the 255/256 boundary
14523 * are added to a separate list. (These are valid
14524 * only when the locale is UTF-8.) */
14525 if (c < 256 && LOC) {
14526 *use_list = add_cp_to_invlist(*use_list, c);
14530 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14532 cp_list = add_cp_to_invlist(cp_list, c);
14535 /* Similarly folds involving non-ascii Latin1
14536 * characters under /d are added to their list */
14537 depends_list = add_cp_to_invlist(depends_list,
14544 SvREFCNT_dec_NN(fold_intersection);
14547 /* Now that we have finished adding all the folds, there is no reason
14548 * to keep the foldable list separate */
14549 _invlist_union(cp_list, cp_foldable_list, &cp_list);
14550 SvREFCNT_dec_NN(cp_foldable_list);
14553 /* And combine the result (if any) with any inversion list from posix
14554 * classes. The lists are kept separate up to now because we don't want to
14555 * fold the classes (folding of those is automatically handled by the swash
14556 * fetching code) */
14557 if (posixes || nposixes) {
14558 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14559 /* Under /a and /aa, nothing above ASCII matches these */
14560 _invlist_intersection(posixes,
14561 PL_XPosix_ptrs[_CC_ASCII],
14565 if (DEPENDS_SEMANTICS) {
14566 /* Under /d, everything in the upper half of the Latin1 range
14567 * matches these complements */
14568 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14570 else if (AT_LEAST_ASCII_RESTRICTED) {
14571 /* Under /a and /aa, everything above ASCII matches these
14573 _invlist_union_complement_2nd(nposixes,
14574 PL_XPosix_ptrs[_CC_ASCII],
14578 _invlist_union(posixes, nposixes, &posixes);
14579 SvREFCNT_dec_NN(nposixes);
14582 posixes = nposixes;
14585 if (! DEPENDS_SEMANTICS) {
14587 _invlist_union(cp_list, posixes, &cp_list);
14588 SvREFCNT_dec_NN(posixes);
14595 /* Under /d, we put into a separate list the Latin1 things that
14596 * match only when the target string is utf8 */
14597 SV* nonascii_but_latin1_properties = NULL;
14598 _invlist_intersection(posixes, PL_UpperLatin1,
14599 &nonascii_but_latin1_properties);
14600 _invlist_subtract(posixes, nonascii_but_latin1_properties,
14603 _invlist_union(cp_list, posixes, &cp_list);
14604 SvREFCNT_dec_NN(posixes);
14610 if (depends_list) {
14611 _invlist_union(depends_list, nonascii_but_latin1_properties,
14613 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14616 depends_list = nonascii_but_latin1_properties;
14621 /* And combine the result (if any) with any inversion list from properties.
14622 * The lists are kept separate up to now so that we can distinguish the two
14623 * in regards to matching above-Unicode. A run-time warning is generated
14624 * if a Unicode property is matched against a non-Unicode code point. But,
14625 * we allow user-defined properties to match anything, without any warning,
14626 * and we also suppress the warning if there is a portion of the character
14627 * class that isn't a Unicode property, and which matches above Unicode, \W
14628 * or [\x{110000}] for example.
14629 * (Note that in this case, unlike the Posix one above, there is no
14630 * <depends_list>, because having a Unicode property forces Unicode
14635 /* If it matters to the final outcome, see if a non-property
14636 * component of the class matches above Unicode. If so, the
14637 * warning gets suppressed. This is true even if just a single
14638 * such code point is specified, as though not strictly correct if
14639 * another such code point is matched against, the fact that they
14640 * are using above-Unicode code points indicates they should know
14641 * the issues involved */
14643 warn_super = ! (invert
14644 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14647 _invlist_union(properties, cp_list, &cp_list);
14648 SvREFCNT_dec_NN(properties);
14651 cp_list = properties;
14655 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14659 /* Here, we have calculated what code points should be in the character
14662 * Now we can see about various optimizations. Fold calculation (which we
14663 * did above) needs to take place before inversion. Otherwise /[^k]/i
14664 * would invert to include K, which under /i would match k, which it
14665 * shouldn't. Therefore we can't invert folded locale now, as it won't be
14666 * folded until runtime */
14668 /* If we didn't do folding, it's because some information isn't available
14669 * until runtime; set the run-time fold flag for these. (We don't have to
14670 * worry about properties folding, as that is taken care of by the swash
14671 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
14672 * locales, or the class matches at least one 0-255 range code point */
14674 if (only_utf8_locale_list) {
14675 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14677 else if (cp_list) { /* Look to see if there a 0-255 code point is in
14680 invlist_iterinit(cp_list);
14681 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14682 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14684 invlist_iterfinish(cp_list);
14688 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14689 * at compile time. Besides not inverting folded locale now, we can't
14690 * invert if there are things such as \w, which aren't known until runtime
14694 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14696 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14698 _invlist_invert(cp_list);
14700 /* Any swash can't be used as-is, because we've inverted things */
14702 SvREFCNT_dec_NN(swash);
14706 /* Clear the invert flag since have just done it here */
14711 *ret_invlist = cp_list;
14712 SvREFCNT_dec(swash);
14714 /* Discard the generated node */
14716 RExC_size = orig_size;
14719 RExC_emit = orig_emit;
14724 /* Some character classes are equivalent to other nodes. Such nodes take
14725 * up less room and generally fewer operations to execute than ANYOF nodes.
14726 * Above, we checked for and optimized into some such equivalents for
14727 * certain common classes that are easy to test. Getting to this point in
14728 * the code means that the class didn't get optimized there. Since this
14729 * code is only executed in Pass 2, it is too late to save space--it has
14730 * been allocated in Pass 1, and currently isn't given back. But turning
14731 * things into an EXACTish node can allow the optimizer to join it to any
14732 * adjacent such nodes. And if the class is equivalent to things like /./,
14733 * expensive run-time swashes can be avoided. Now that we have more
14734 * complete information, we can find things necessarily missed by the
14735 * earlier code. I (khw) am not sure how much to look for here. It would
14736 * be easy, but perhaps too slow, to check any candidates against all the
14737 * node types they could possibly match using _invlistEQ(). */
14742 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14743 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14745 /* We don't optimize if we are supposed to make sure all non-Unicode
14746 * code points raise a warning, as only ANYOF nodes have this check.
14748 && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14751 U8 op = END; /* The optimzation node-type */
14752 const char * cur_parse= RExC_parse;
14754 invlist_iterinit(cp_list);
14755 if (! invlist_iternext(cp_list, &start, &end)) {
14757 /* Here, the list is empty. This happens, for example, when a
14758 * Unicode property is the only thing in the character class, and
14759 * it doesn't match anything. (perluniprops.pod notes such
14762 *flagp |= HASWIDTH|SIMPLE;
14764 else if (start == end) { /* The range is a single code point */
14765 if (! invlist_iternext(cp_list, &start, &end)
14767 /* Don't do this optimization if it would require changing
14768 * the pattern to UTF-8 */
14769 && (start < 256 || UTF))
14771 /* Here, the list contains a single code point. Can optimize
14772 * into an EXACTish node */
14781 /* A locale node under folding with one code point can be
14782 * an EXACTFL, as its fold won't be calculated until
14788 /* Here, we are generally folding, but there is only one
14789 * code point to match. If we have to, we use an EXACT
14790 * node, but it would be better for joining with adjacent
14791 * nodes in the optimization pass if we used the same
14792 * EXACTFish node that any such are likely to be. We can
14793 * do this iff the code point doesn't participate in any
14794 * folds. For example, an EXACTF of a colon is the same as
14795 * an EXACT one, since nothing folds to or from a colon. */
14797 if (IS_IN_SOME_FOLD_L1(value)) {
14802 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14807 /* If we haven't found the node type, above, it means we
14808 * can use the prevailing one */
14810 op = compute_EXACTish(pRExC_state);
14815 else if (start == 0) {
14816 if (end == UV_MAX) {
14818 *flagp |= HASWIDTH|SIMPLE;
14821 else if (end == '\n' - 1
14822 && invlist_iternext(cp_list, &start, &end)
14823 && start == '\n' + 1 && end == UV_MAX)
14826 *flagp |= HASWIDTH|SIMPLE;
14830 invlist_iterfinish(cp_list);
14833 RExC_parse = (char *)orig_parse;
14834 RExC_emit = (regnode *)orig_emit;
14836 ret = reg_node(pRExC_state, op);
14838 RExC_parse = (char *)cur_parse;
14840 if (PL_regkind[op] == EXACT) {
14841 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14842 TRUE /* downgradable to EXACT */
14846 SvREFCNT_dec_NN(cp_list);
14851 /* Here, <cp_list> contains all the code points we can determine at
14852 * compile time that match under all conditions. Go through it, and
14853 * for things that belong in the bitmap, put them there, and delete from
14854 * <cp_list>. While we are at it, see if everything above 255 is in the
14855 * list, and if so, set a flag to speed up execution */
14857 populate_ANYOF_from_invlist(ret, &cp_list);
14860 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14863 /* Here, the bitmap has been populated with all the Latin1 code points that
14864 * always match. Can now add to the overall list those that match only
14865 * when the target string is UTF-8 (<depends_list>). */
14866 if (depends_list) {
14868 _invlist_union(cp_list, depends_list, &cp_list);
14869 SvREFCNT_dec_NN(depends_list);
14872 cp_list = depends_list;
14874 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14877 /* If there is a swash and more than one element, we can't use the swash in
14878 * the optimization below. */
14879 if (swash && element_count > 1) {
14880 SvREFCNT_dec_NN(swash);
14884 set_ANYOF_arg(pRExC_state, ret, cp_list,
14885 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14887 only_utf8_locale_list,
14888 swash, has_user_defined_property);
14890 *flagp |= HASWIDTH|SIMPLE;
14892 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14893 RExC_contains_locale = 1;
14899 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14902 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14903 regnode* const node,
14905 SV* const runtime_defns,
14906 SV* const only_utf8_locale_list,
14908 const bool has_user_defined_property)
14910 /* Sets the arg field of an ANYOF-type node 'node', using information about
14911 * the node passed-in. If there is nothing outside the node's bitmap, the
14912 * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to
14913 * the count returned by add_data(), having allocated and stored an array,
14914 * av, that that count references, as follows:
14915 * av[0] stores the character class description in its textual form.
14916 * This is used later (regexec.c:Perl_regclass_swash()) to
14917 * initialize the appropriate swash, and is also useful for dumping
14918 * the regnode. This is set to &PL_sv_undef if the textual
14919 * description is not needed at run-time (as happens if the other
14920 * elements completely define the class)
14921 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14922 * computed from av[0]. But if no further computation need be done,
14923 * the swash is stored here now (and av[0] is &PL_sv_undef).
14924 * av[2] stores the inversion list of code points that match only if the
14925 * current locale is UTF-8
14926 * av[3] stores the cp_list inversion list for use in addition or instead
14927 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14928 * (Otherwise everything needed is already in av[0] and av[1])
14929 * av[4] is set if any component of the class is from a user-defined
14930 * property; used only if av[3] exists */
14934 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14936 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14937 assert(! (ANYOF_FLAGS(node)
14938 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14939 ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14942 AV * const av = newAV();
14945 assert(ANYOF_FLAGS(node)
14946 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14948 av_store(av, 0, (runtime_defns)
14949 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14951 av_store(av, 1, swash);
14952 SvREFCNT_dec_NN(cp_list);
14955 av_store(av, 1, &PL_sv_undef);
14957 av_store(av, 3, cp_list);
14958 av_store(av, 4, newSVuv(has_user_defined_property));
14962 if (only_utf8_locale_list) {
14963 av_store(av, 2, only_utf8_locale_list);
14966 av_store(av, 2, &PL_sv_undef);
14969 rv = newRV_noinc(MUTABLE_SV(av));
14970 n = add_data(pRExC_state, STR_WITH_LEN("s"));
14971 RExC_rxi->data->data[n] = (void*)rv;
14977 /* reg_skipcomment()
14979 Absorbs an /x style # comments from the input stream.
14980 Returns true if there is more text remaining in the stream.
14981 Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
14982 terminates the pattern without including a newline.
14984 Note its the callers responsibility to ensure that we are
14985 actually in /x mode
14990 S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
14994 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
14996 while (RExC_parse < RExC_end)
14997 if (*RExC_parse++ == '\n') {
15002 /* we ran off the end of the pattern without ending
15003 the comment, so we have to add an \n when wrapping */
15004 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15012 Advances the parse position, and optionally absorbs
15013 "whitespace" from the inputstream.
15015 Without /x "whitespace" means (?#...) style comments only,
15016 with /x this means (?#...) and # comments and whitespace proper.
15018 Returns the RExC_parse point from BEFORE the scan occurs.
15020 This is the /x friendly way of saying RExC_parse++.
15024 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15026 char* const retval = RExC_parse++;
15028 PERL_ARGS_ASSERT_NEXTCHAR;
15031 if (RExC_end - RExC_parse >= 3
15032 && *RExC_parse == '('
15033 && RExC_parse[1] == '?'
15034 && RExC_parse[2] == '#')
15036 while (*RExC_parse != ')') {
15037 if (RExC_parse == RExC_end)
15038 FAIL("Sequence (?#... not terminated");
15044 if (RExC_flags & RXf_PMf_EXTENDED) {
15045 if (isSPACE(*RExC_parse)) {
15049 else if (*RExC_parse == '#') {
15050 if ( reg_skipcomment( pRExC_state ) )
15059 - reg_node - emit a node
15061 STATIC regnode * /* Location. */
15062 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15066 regnode * const ret = RExC_emit;
15067 GET_RE_DEBUG_FLAGS_DECL;
15069 PERL_ARGS_ASSERT_REG_NODE;
15072 SIZE_ALIGN(RExC_size);
15076 if (RExC_emit >= RExC_emit_bound)
15077 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15078 op, RExC_emit, RExC_emit_bound);
15080 NODE_ALIGN_FILL(ret);
15082 FILL_ADVANCE_NODE(ptr, op);
15083 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
15084 #ifdef RE_TRACK_PATTERN_OFFSETS
15085 if (RExC_offsets) { /* MJD */
15087 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15088 "reg_node", __LINE__,
15090 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15091 ? "Overwriting end of array!\n" : "OK",
15092 (UV)(RExC_emit - RExC_emit_start),
15093 (UV)(RExC_parse - RExC_start),
15094 (UV)RExC_offsets[0]));
15095 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15103 - reganode - emit a node with an argument
15105 STATIC regnode * /* Location. */
15106 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15110 regnode * const ret = RExC_emit;
15111 GET_RE_DEBUG_FLAGS_DECL;
15113 PERL_ARGS_ASSERT_REGANODE;
15116 SIZE_ALIGN(RExC_size);
15121 assert(2==regarglen[op]+1);
15123 Anything larger than this has to allocate the extra amount.
15124 If we changed this to be:
15126 RExC_size += (1 + regarglen[op]);
15128 then it wouldn't matter. Its not clear what side effect
15129 might come from that so its not done so far.
15134 if (RExC_emit >= RExC_emit_bound)
15135 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15136 op, RExC_emit, RExC_emit_bound);
15138 NODE_ALIGN_FILL(ret);
15140 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15141 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
15142 #ifdef RE_TRACK_PATTERN_OFFSETS
15143 if (RExC_offsets) { /* MJD */
15145 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15149 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15150 "Overwriting end of array!\n" : "OK",
15151 (UV)(RExC_emit - RExC_emit_start),
15152 (UV)(RExC_parse - RExC_start),
15153 (UV)RExC_offsets[0]));
15154 Set_Cur_Node_Offset;
15162 - reguni - emit (if appropriate) a Unicode character
15164 PERL_STATIC_INLINE STRLEN
15165 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15169 PERL_ARGS_ASSERT_REGUNI;
15171 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15175 - reginsert - insert an operator in front of already-emitted operand
15177 * Means relocating the operand.
15180 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15186 const int offset = regarglen[(U8)op];
15187 const int size = NODE_STEP_REGNODE + offset;
15188 GET_RE_DEBUG_FLAGS_DECL;
15190 PERL_ARGS_ASSERT_REGINSERT;
15191 PERL_UNUSED_ARG(depth);
15192 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15193 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15202 if (RExC_open_parens) {
15204 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15205 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15206 if ( RExC_open_parens[paren] >= opnd ) {
15207 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15208 RExC_open_parens[paren] += size;
15210 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15212 if ( RExC_close_parens[paren] >= opnd ) {
15213 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15214 RExC_close_parens[paren] += size;
15216 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15221 while (src > opnd) {
15222 StructCopy(--src, --dst, regnode);
15223 #ifdef RE_TRACK_PATTERN_OFFSETS
15224 if (RExC_offsets) { /* MJD 20010112 */
15226 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15230 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15231 ? "Overwriting end of array!\n" : "OK",
15232 (UV)(src - RExC_emit_start),
15233 (UV)(dst - RExC_emit_start),
15234 (UV)RExC_offsets[0]));
15235 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15236 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15242 place = opnd; /* Op node, where operand used to be. */
15243 #ifdef RE_TRACK_PATTERN_OFFSETS
15244 if (RExC_offsets) { /* MJD */
15246 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15250 (UV)(place - RExC_emit_start) > RExC_offsets[0]
15251 ? "Overwriting end of array!\n" : "OK",
15252 (UV)(place - RExC_emit_start),
15253 (UV)(RExC_parse - RExC_start),
15254 (UV)RExC_offsets[0]));
15255 Set_Node_Offset(place, RExC_parse);
15256 Set_Node_Length(place, 1);
15259 src = NEXTOPER(place);
15260 FILL_ADVANCE_NODE(place, op);
15261 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
15262 Zero(src, offset, regnode);
15266 - regtail - set the next-pointer at the end of a node chain of p to val.
15267 - SEE ALSO: regtail_study
15269 /* TODO: All three parms should be const */
15271 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15272 const regnode *val,U32 depth)
15276 GET_RE_DEBUG_FLAGS_DECL;
15278 PERL_ARGS_ASSERT_REGTAIL;
15280 PERL_UNUSED_ARG(depth);
15286 /* Find last node. */
15289 regnode * const temp = regnext(scan);
15291 SV * const mysv=sv_newmortal();
15292 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15293 regprop(RExC_rx, mysv, scan, NULL);
15294 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15295 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15296 (temp == NULL ? "->" : ""),
15297 (temp == NULL ? PL_reg_name[OP(val)] : "")
15305 if (reg_off_by_arg[OP(scan)]) {
15306 ARG_SET(scan, val - scan);
15309 NEXT_OFF(scan) = val - scan;
15315 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15316 - Look for optimizable sequences at the same time.
15317 - currently only looks for EXACT chains.
15319 This is experimental code. The idea is to use this routine to perform
15320 in place optimizations on branches and groups as they are constructed,
15321 with the long term intention of removing optimization from study_chunk so
15322 that it is purely analytical.
15324 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15325 to control which is which.
15328 /* TODO: All four parms should be const */
15331 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15332 const regnode *val,U32 depth)
15337 #ifdef EXPERIMENTAL_INPLACESCAN
15340 GET_RE_DEBUG_FLAGS_DECL;
15342 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15348 /* Find last node. */
15352 regnode * const temp = regnext(scan);
15353 #ifdef EXPERIMENTAL_INPLACESCAN
15354 if (PL_regkind[OP(scan)] == EXACT) {
15355 bool unfolded_multi_char; /* Unexamined in this routine */
15356 if (join_exact(pRExC_state, scan, &min,
15357 &unfolded_multi_char, 1, val, depth+1))
15362 switch (OP(scan)) {
15365 case EXACTFA_NO_TRIE:
15370 if( exact == PSEUDO )
15372 else if ( exact != OP(scan) )
15381 SV * const mysv=sv_newmortal();
15382 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15383 regprop(RExC_rx, mysv, scan, NULL);
15384 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15385 SvPV_nolen_const(mysv),
15386 REG_NODE_NUM(scan),
15387 PL_reg_name[exact]);
15394 SV * const mysv_val=sv_newmortal();
15395 DEBUG_PARSE_MSG("");
15396 regprop(RExC_rx, mysv_val, val, NULL);
15397 PerlIO_printf(Perl_debug_log,
15398 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15399 SvPV_nolen_const(mysv_val),
15400 (IV)REG_NODE_NUM(val),
15404 if (reg_off_by_arg[OP(scan)]) {
15405 ARG_SET(scan, val - scan);
15408 NEXT_OFF(scan) = val - scan;
15416 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15421 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15426 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15428 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15429 if (flags & (1<<bit)) {
15430 if (!set++ && lead)
15431 PerlIO_printf(Perl_debug_log, "%s",lead);
15432 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15437 PerlIO_printf(Perl_debug_log, "\n");
15439 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15444 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15450 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15452 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15453 if (flags & (1<<bit)) {
15454 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15457 if (!set++ && lead)
15458 PerlIO_printf(Perl_debug_log, "%s",lead);
15459 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15462 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15463 if (!set++ && lead) {
15464 PerlIO_printf(Perl_debug_log, "%s",lead);
15467 case REGEX_UNICODE_CHARSET:
15468 PerlIO_printf(Perl_debug_log, "UNICODE");
15470 case REGEX_LOCALE_CHARSET:
15471 PerlIO_printf(Perl_debug_log, "LOCALE");
15473 case REGEX_ASCII_RESTRICTED_CHARSET:
15474 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15476 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15477 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15480 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15486 PerlIO_printf(Perl_debug_log, "\n");
15488 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15494 Perl_regdump(pTHX_ const regexp *r)
15498 SV * const sv = sv_newmortal();
15499 SV *dsv= sv_newmortal();
15500 RXi_GET_DECL(r,ri);
15501 GET_RE_DEBUG_FLAGS_DECL;
15503 PERL_ARGS_ASSERT_REGDUMP;
15505 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15507 /* Header fields of interest. */
15508 if (r->anchored_substr) {
15509 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15510 RE_SV_DUMPLEN(r->anchored_substr), 30);
15511 PerlIO_printf(Perl_debug_log,
15512 "anchored %s%s at %"IVdf" ",
15513 s, RE_SV_TAIL(r->anchored_substr),
15514 (IV)r->anchored_offset);
15515 } else if (r->anchored_utf8) {
15516 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15517 RE_SV_DUMPLEN(r->anchored_utf8), 30);
15518 PerlIO_printf(Perl_debug_log,
15519 "anchored utf8 %s%s at %"IVdf" ",
15520 s, RE_SV_TAIL(r->anchored_utf8),
15521 (IV)r->anchored_offset);
15523 if (r->float_substr) {
15524 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15525 RE_SV_DUMPLEN(r->float_substr), 30);
15526 PerlIO_printf(Perl_debug_log,
15527 "floating %s%s at %"IVdf"..%"UVuf" ",
15528 s, RE_SV_TAIL(r->float_substr),
15529 (IV)r->float_min_offset, (UV)r->float_max_offset);
15530 } else if (r->float_utf8) {
15531 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15532 RE_SV_DUMPLEN(r->float_utf8), 30);
15533 PerlIO_printf(Perl_debug_log,
15534 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15535 s, RE_SV_TAIL(r->float_utf8),
15536 (IV)r->float_min_offset, (UV)r->float_max_offset);
15538 if (r->check_substr || r->check_utf8)
15539 PerlIO_printf(Perl_debug_log,
15541 (r->check_substr == r->float_substr
15542 && r->check_utf8 == r->float_utf8
15543 ? "(checking floating" : "(checking anchored"));
15544 if (r->intflags & PREGf_NOSCAN)
15545 PerlIO_printf(Perl_debug_log, " noscan");
15546 if (r->extflags & RXf_CHECK_ALL)
15547 PerlIO_printf(Perl_debug_log, " isall");
15548 if (r->check_substr || r->check_utf8)
15549 PerlIO_printf(Perl_debug_log, ") ");
15551 if (ri->regstclass) {
15552 regprop(r, sv, ri->regstclass, NULL);
15553 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15555 if (r->intflags & PREGf_ANCH) {
15556 PerlIO_printf(Perl_debug_log, "anchored");
15557 if (r->intflags & PREGf_ANCH_BOL)
15558 PerlIO_printf(Perl_debug_log, "(BOL)");
15559 if (r->intflags & PREGf_ANCH_MBOL)
15560 PerlIO_printf(Perl_debug_log, "(MBOL)");
15561 if (r->intflags & PREGf_ANCH_SBOL)
15562 PerlIO_printf(Perl_debug_log, "(SBOL)");
15563 if (r->intflags & PREGf_ANCH_GPOS)
15564 PerlIO_printf(Perl_debug_log, "(GPOS)");
15565 PerlIO_putc(Perl_debug_log, ' ');
15567 if (r->intflags & PREGf_GPOS_SEEN)
15568 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15569 if (r->intflags & PREGf_SKIP)
15570 PerlIO_printf(Perl_debug_log, "plus ");
15571 if (r->intflags & PREGf_IMPLICIT)
15572 PerlIO_printf(Perl_debug_log, "implicit ");
15573 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15574 if (r->extflags & RXf_EVAL_SEEN)
15575 PerlIO_printf(Perl_debug_log, "with eval ");
15576 PerlIO_printf(Perl_debug_log, "\n");
15578 regdump_extflags("r->extflags: ",r->extflags);
15579 regdump_intflags("r->intflags: ",r->intflags);
15582 PERL_ARGS_ASSERT_REGDUMP;
15583 PERL_UNUSED_CONTEXT;
15584 PERL_UNUSED_ARG(r);
15585 #endif /* DEBUGGING */
15589 - regprop - printable representation of opcode, with run time support
15593 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15599 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15600 static const char * const anyofs[] = {
15601 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15602 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
15603 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
15604 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
15605 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
15606 || _CC_VERTSPACE != 16
15607 #error Need to adjust order of anyofs[]
15644 RXi_GET_DECL(prog,progi);
15645 GET_RE_DEBUG_FLAGS_DECL;
15647 PERL_ARGS_ASSERT_REGPROP;
15651 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
15652 /* It would be nice to FAIL() here, but this may be called from
15653 regexec.c, and it would be hard to supply pRExC_state. */
15654 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15655 (int)OP(o), (int)REGNODE_MAX);
15656 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15658 k = PL_regkind[OP(o)];
15661 sv_catpvs(sv, " ");
15662 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15663 * is a crude hack but it may be the best for now since
15664 * we have no flag "this EXACTish node was UTF-8"
15666 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15667 PERL_PV_ESCAPE_UNI_DETECT |
15668 PERL_PV_ESCAPE_NONASCII |
15669 PERL_PV_PRETTY_ELLIPSES |
15670 PERL_PV_PRETTY_LTGT |
15671 PERL_PV_PRETTY_NOCLEAR
15673 } else if (k == TRIE) {
15674 /* print the details of the trie in dumpuntil instead, as
15675 * progi->data isn't available here */
15676 const char op = OP(o);
15677 const U32 n = ARG(o);
15678 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15679 (reg_ac_data *)progi->data->data[n] :
15681 const reg_trie_data * const trie
15682 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15684 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15685 DEBUG_TRIE_COMPILE_r(
15686 Perl_sv_catpvf(aTHX_ sv,
15687 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15688 (UV)trie->startstate,
15689 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15690 (UV)trie->wordcount,
15693 (UV)TRIE_CHARCOUNT(trie),
15694 (UV)trie->uniquecharcount
15697 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15698 sv_catpvs(sv, "[");
15699 (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15701 : TRIE_BITMAP(trie));
15702 sv_catpvs(sv, "]");
15705 } else if (k == CURLY) {
15706 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15707 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15708 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15710 else if (k == WHILEM && o->flags) /* Ordinal/of */
15711 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15712 else if (k == REF || k == OPEN || k == CLOSE
15713 || k == GROUPP || OP(o)==ACCEPT)
15715 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15716 if ( RXp_PAREN_NAMES(prog) ) {
15717 if ( k != REF || (OP(o) < NREF)) {
15718 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15719 SV **name= av_fetch(list, ARG(o), 0 );
15721 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15724 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15725 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15726 I32 *nums=(I32*)SvPVX(sv_dat);
15727 SV **name= av_fetch(list, nums[0], 0 );
15730 for ( n=0; n<SvIVX(sv_dat); n++ ) {
15731 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15732 (n ? "," : ""), (IV)nums[n]);
15734 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15738 if ( k == REF && reginfo) {
15739 U32 n = ARG(o); /* which paren pair */
15740 I32 ln = prog->offs[n].start;
15741 if (prog->lastparen < n || ln == -1)
15742 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15743 else if (ln == prog->offs[n].end)
15744 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15746 const char *s = reginfo->strbeg + ln;
15747 Perl_sv_catpvf(aTHX_ sv, ": ");
15748 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15749 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15752 } else if (k == GOSUB)
15753 /* Paren and offset */
15754 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15755 else if (k == VERB) {
15757 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15758 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15759 } else if (k == LOGICAL)
15760 /* 2: embedded, otherwise 1 */
15761 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15762 else if (k == ANYOF) {
15763 const U8 flags = ANYOF_FLAGS(o);
15767 if (flags & ANYOF_LOCALE_FLAGS)
15768 sv_catpvs(sv, "{loc}");
15769 if (flags & ANYOF_LOC_FOLD)
15770 sv_catpvs(sv, "{i}");
15771 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15772 if (flags & ANYOF_INVERT)
15773 sv_catpvs(sv, "^");
15775 /* output what the standard cp 0-255 bitmap matches */
15776 do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15778 /* output any special charclass tests (used entirely under use
15780 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15782 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15783 if (ANYOF_POSIXL_TEST(o,i)) {
15784 sv_catpv(sv, anyofs[i]);
15790 if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15792 |ANYOF_NONBITMAP_NON_UTF8
15796 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15797 if (flags & ANYOF_INVERT)
15798 /*make sure the invert info is in each */
15799 sv_catpvs(sv, "^");
15802 if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15803 sv_catpvs(sv, "{non-utf8-latin1-all}");
15806 /* output information about the unicode matching */
15807 if (flags & ANYOF_ABOVE_LATIN1_ALL)
15808 sv_catpvs(sv, "{unicode_all}");
15809 else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15810 SV *lv; /* Set if there is something outside the bit map. */
15811 bool byte_output = FALSE; /* If something in the bitmap has
15813 SV *only_utf8_locale;
15815 /* Get the stuff that wasn't in the bitmap */
15816 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15817 &lv, &only_utf8_locale);
15818 if (lv && lv != &PL_sv_undef) {
15819 char *s = savesvpv(lv);
15820 char * const origs = s;
15822 while (*s && *s != '\n')
15826 const char * const t = ++s;
15828 if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15829 sv_catpvs(sv, "{outside bitmap}");
15832 sv_catpvs(sv, "{utf8}");
15836 sv_catpvs(sv, " ");
15842 /* Truncate very long output */
15843 if (s - origs > 256) {
15844 Perl_sv_catpvf(aTHX_ sv,
15846 (int) (s - origs - 1),
15852 else if (*s == '\t') {
15866 SvREFCNT_dec_NN(lv);
15869 if ((flags & ANYOF_LOC_FOLD)
15870 && only_utf8_locale
15871 && only_utf8_locale != &PL_sv_undef)
15874 int max_entries = 256;
15876 sv_catpvs(sv, "{utf8 locale}");
15877 invlist_iterinit(only_utf8_locale);
15878 while (invlist_iternext(only_utf8_locale,
15880 put_range(sv, start, end);
15882 if (max_entries < 0) {
15883 sv_catpvs(sv, "...");
15887 invlist_iterfinish(only_utf8_locale);
15892 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15894 else if (k == POSIXD || k == NPOSIXD) {
15895 U8 index = FLAGS(o) * 2;
15896 if (index < C_ARRAY_LENGTH(anyofs)) {
15897 if (*anyofs[index] != '[') {
15900 sv_catpv(sv, anyofs[index]);
15901 if (*anyofs[index] != '[') {
15906 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15909 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15910 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15912 PERL_UNUSED_CONTEXT;
15913 PERL_UNUSED_ARG(sv);
15914 PERL_UNUSED_ARG(o);
15915 PERL_UNUSED_ARG(prog);
15916 PERL_UNUSED_ARG(reginfo);
15917 #endif /* DEBUGGING */
15923 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15924 { /* Assume that RE_INTUIT is set */
15926 struct regexp *const prog = ReANY(r);
15927 GET_RE_DEBUG_FLAGS_DECL;
15929 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15930 PERL_UNUSED_CONTEXT;
15934 const char * const s = SvPV_nolen_const(prog->check_substr
15935 ? prog->check_substr : prog->check_utf8);
15937 if (!PL_colorset) reginitcolors();
15938 PerlIO_printf(Perl_debug_log,
15939 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15941 prog->check_substr ? "" : "utf8 ",
15942 PL_colors[5],PL_colors[0],
15945 (strlen(s) > 60 ? "..." : ""));
15948 return prog->check_substr ? prog->check_substr : prog->check_utf8;
15954 handles refcounting and freeing the perl core regexp structure. When
15955 it is necessary to actually free the structure the first thing it
15956 does is call the 'free' method of the regexp_engine associated to
15957 the regexp, allowing the handling of the void *pprivate; member
15958 first. (This routine is not overridable by extensions, which is why
15959 the extensions free is called first.)
15961 See regdupe and regdupe_internal if you change anything here.
15963 #ifndef PERL_IN_XSUB_RE
15965 Perl_pregfree(pTHX_ REGEXP *r)
15971 Perl_pregfree2(pTHX_ REGEXP *rx)
15974 struct regexp *const r = ReANY(rx);
15975 GET_RE_DEBUG_FLAGS_DECL;
15977 PERL_ARGS_ASSERT_PREGFREE2;
15979 if (r->mother_re) {
15980 ReREFCNT_dec(r->mother_re);
15982 CALLREGFREE_PVT(rx); /* free the private data */
15983 SvREFCNT_dec(RXp_PAREN_NAMES(r));
15984 Safefree(r->xpv_len_u.xpvlenu_pv);
15987 SvREFCNT_dec(r->anchored_substr);
15988 SvREFCNT_dec(r->anchored_utf8);
15989 SvREFCNT_dec(r->float_substr);
15990 SvREFCNT_dec(r->float_utf8);
15991 Safefree(r->substrs);
15993 RX_MATCH_COPY_FREE(rx);
15994 #ifdef PERL_ANY_COW
15995 SvREFCNT_dec(r->saved_copy);
15998 SvREFCNT_dec(r->qr_anoncv);
15999 rx->sv_u.svu_rx = 0;
16004 This is a hacky workaround to the structural issue of match results
16005 being stored in the regexp structure which is in turn stored in
16006 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16007 could be PL_curpm in multiple contexts, and could require multiple
16008 result sets being associated with the pattern simultaneously, such
16009 as when doing a recursive match with (??{$qr})
16011 The solution is to make a lightweight copy of the regexp structure
16012 when a qr// is returned from the code executed by (??{$qr}) this
16013 lightweight copy doesn't actually own any of its data except for
16014 the starp/end and the actual regexp structure itself.
16020 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16022 struct regexp *ret;
16023 struct regexp *const r = ReANY(rx);
16024 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16026 PERL_ARGS_ASSERT_REG_TEMP_COPY;
16029 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16031 SvOK_off((SV *)ret_x);
16033 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16034 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
16035 made both spots point to the same regexp body.) */
16036 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16037 assert(!SvPVX(ret_x));
16038 ret_x->sv_u.svu_rx = temp->sv_any;
16039 temp->sv_any = NULL;
16040 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16041 SvREFCNT_dec_NN(temp);
16042 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16043 ing below will not set it. */
16044 SvCUR_set(ret_x, SvCUR(rx));
16047 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16048 sv_force_normal(sv) is called. */
16050 ret = ReANY(ret_x);
16052 SvFLAGS(ret_x) |= SvUTF8(rx);
16053 /* We share the same string buffer as the original regexp, on which we
16054 hold a reference count, incremented when mother_re is set below.
16055 The string pointer is copied here, being part of the regexp struct.
16057 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16058 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16060 const I32 npar = r->nparens+1;
16061 Newx(ret->offs, npar, regexp_paren_pair);
16062 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16065 Newx(ret->substrs, 1, struct reg_substr_data);
16066 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16068 SvREFCNT_inc_void(ret->anchored_substr);
16069 SvREFCNT_inc_void(ret->anchored_utf8);
16070 SvREFCNT_inc_void(ret->float_substr);
16071 SvREFCNT_inc_void(ret->float_utf8);
16073 /* check_substr and check_utf8, if non-NULL, point to either their
16074 anchored or float namesakes, and don't hold a second reference. */
16076 RX_MATCH_COPIED_off(ret_x);
16077 #ifdef PERL_ANY_COW
16078 ret->saved_copy = NULL;
16080 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16081 SvREFCNT_inc_void(ret->qr_anoncv);
16087 /* regfree_internal()
16089 Free the private data in a regexp. This is overloadable by
16090 extensions. Perl takes care of the regexp structure in pregfree(),
16091 this covers the *pprivate pointer which technically perl doesn't
16092 know about, however of course we have to handle the
16093 regexp_internal structure when no extension is in use.
16095 Note this is called before freeing anything in the regexp
16100 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16103 struct regexp *const r = ReANY(rx);
16104 RXi_GET_DECL(r,ri);
16105 GET_RE_DEBUG_FLAGS_DECL;
16107 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16113 SV *dsv= sv_newmortal();
16114 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16115 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16116 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16117 PL_colors[4],PL_colors[5],s);
16120 #ifdef RE_TRACK_PATTERN_OFFSETS
16122 Safefree(ri->u.offsets); /* 20010421 MJD */
16124 if (ri->code_blocks) {
16126 for (n = 0; n < ri->num_code_blocks; n++)
16127 SvREFCNT_dec(ri->code_blocks[n].src_regex);
16128 Safefree(ri->code_blocks);
16132 int n = ri->data->count;
16135 /* If you add a ->what type here, update the comment in regcomp.h */
16136 switch (ri->data->what[n]) {
16142 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16145 Safefree(ri->data->data[n]);
16151 { /* Aho Corasick add-on structure for a trie node.
16152 Used in stclass optimization only */
16154 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16156 refcount = --aho->refcount;
16159 PerlMemShared_free(aho->states);
16160 PerlMemShared_free(aho->fail);
16161 /* do this last!!!! */
16162 PerlMemShared_free(ri->data->data[n]);
16163 PerlMemShared_free(ri->regstclass);
16169 /* trie structure. */
16171 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16173 refcount = --trie->refcount;
16176 PerlMemShared_free(trie->charmap);
16177 PerlMemShared_free(trie->states);
16178 PerlMemShared_free(trie->trans);
16180 PerlMemShared_free(trie->bitmap);
16182 PerlMemShared_free(trie->jump);
16183 PerlMemShared_free(trie->wordinfo);
16184 /* do this last!!!! */
16185 PerlMemShared_free(ri->data->data[n]);
16190 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16191 ri->data->what[n]);
16194 Safefree(ri->data->what);
16195 Safefree(ri->data);
16201 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16202 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16203 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16206 re_dup - duplicate a regexp.
16208 This routine is expected to clone a given regexp structure. It is only
16209 compiled under USE_ITHREADS.
16211 After all of the core data stored in struct regexp is duplicated
16212 the regexp_engine.dupe method is used to copy any private data
16213 stored in the *pprivate pointer. This allows extensions to handle
16214 any duplication it needs to do.
16216 See pregfree() and regfree_internal() if you change anything here.
16218 #if defined(USE_ITHREADS)
16219 #ifndef PERL_IN_XSUB_RE
16221 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16225 const struct regexp *r = ReANY(sstr);
16226 struct regexp *ret = ReANY(dstr);
16228 PERL_ARGS_ASSERT_RE_DUP_GUTS;
16230 npar = r->nparens+1;
16231 Newx(ret->offs, npar, regexp_paren_pair);
16232 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16234 if (ret->substrs) {
16235 /* Do it this way to avoid reading from *r after the StructCopy().
16236 That way, if any of the sv_dup_inc()s dislodge *r from the L1
16237 cache, it doesn't matter. */
16238 const bool anchored = r->check_substr
16239 ? r->check_substr == r->anchored_substr
16240 : r->check_utf8 == r->anchored_utf8;
16241 Newx(ret->substrs, 1, struct reg_substr_data);
16242 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16244 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16245 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16246 ret->float_substr = sv_dup_inc(ret->float_substr, param);
16247 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16249 /* check_substr and check_utf8, if non-NULL, point to either their
16250 anchored or float namesakes, and don't hold a second reference. */
16252 if (ret->check_substr) {
16254 assert(r->check_utf8 == r->anchored_utf8);
16255 ret->check_substr = ret->anchored_substr;
16256 ret->check_utf8 = ret->anchored_utf8;
16258 assert(r->check_substr == r->float_substr);
16259 assert(r->check_utf8 == r->float_utf8);
16260 ret->check_substr = ret->float_substr;
16261 ret->check_utf8 = ret->float_utf8;
16263 } else if (ret->check_utf8) {
16265 ret->check_utf8 = ret->anchored_utf8;
16267 ret->check_utf8 = ret->float_utf8;
16272 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16273 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16276 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16278 if (RX_MATCH_COPIED(dstr))
16279 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
16281 ret->subbeg = NULL;
16282 #ifdef PERL_ANY_COW
16283 ret->saved_copy = NULL;
16286 /* Whether mother_re be set or no, we need to copy the string. We
16287 cannot refrain from copying it when the storage points directly to
16288 our mother regexp, because that's
16289 1: a buffer in a different thread
16290 2: something we no longer hold a reference on
16291 so we need to copy it locally. */
16292 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16293 ret->mother_re = NULL;
16295 #endif /* PERL_IN_XSUB_RE */
16300 This is the internal complement to regdupe() which is used to copy
16301 the structure pointed to by the *pprivate pointer in the regexp.
16302 This is the core version of the extension overridable cloning hook.
16303 The regexp structure being duplicated will be copied by perl prior
16304 to this and will be provided as the regexp *r argument, however
16305 with the /old/ structures pprivate pointer value. Thus this routine
16306 may override any copying normally done by perl.
16308 It returns a pointer to the new regexp_internal structure.
16312 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16315 struct regexp *const r = ReANY(rx);
16316 regexp_internal *reti;
16318 RXi_GET_DECL(r,ri);
16320 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16324 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16325 char, regexp_internal);
16326 Copy(ri->program, reti->program, len+1, regnode);
16328 reti->num_code_blocks = ri->num_code_blocks;
16329 if (ri->code_blocks) {
16331 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16332 struct reg_code_block);
16333 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16334 struct reg_code_block);
16335 for (n = 0; n < ri->num_code_blocks; n++)
16336 reti->code_blocks[n].src_regex = (REGEXP*)
16337 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16340 reti->code_blocks = NULL;
16342 reti->regstclass = NULL;
16345 struct reg_data *d;
16346 const int count = ri->data->count;
16349 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16350 char, struct reg_data);
16351 Newx(d->what, count, U8);
16354 for (i = 0; i < count; i++) {
16355 d->what[i] = ri->data->what[i];
16356 switch (d->what[i]) {
16357 /* see also regcomp.h and regfree_internal() */
16358 case 'a': /* actually an AV, but the dup function is identical. */
16362 case 'u': /* actually an HV, but the dup function is identical. */
16363 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16366 /* This is cheating. */
16367 Newx(d->data[i], 1, regnode_ssc);
16368 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16369 reti->regstclass = (regnode*)d->data[i];
16372 /* Trie stclasses are readonly and can thus be shared
16373 * without duplication. We free the stclass in pregfree
16374 * when the corresponding reg_ac_data struct is freed.
16376 reti->regstclass= ri->regstclass;
16380 ((reg_trie_data*)ri->data->data[i])->refcount++;
16385 d->data[i] = ri->data->data[i];
16388 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16389 ri->data->what[i]);
16398 reti->name_list_idx = ri->name_list_idx;
16400 #ifdef RE_TRACK_PATTERN_OFFSETS
16401 if (ri->u.offsets) {
16402 Newx(reti->u.offsets, 2*len+1, U32);
16403 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16406 SetProgLen(reti,len);
16409 return (void*)reti;
16412 #endif /* USE_ITHREADS */
16414 #ifndef PERL_IN_XSUB_RE
16417 - regnext - dig the "next" pointer out of a node
16420 Perl_regnext(pTHX_ regnode *p)
16428 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
16429 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16430 (int)OP(p), (int)REGNODE_MAX);
16433 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16442 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16445 STRLEN l1 = strlen(pat1);
16446 STRLEN l2 = strlen(pat2);
16449 const char *message;
16451 PERL_ARGS_ASSERT_RE_CROAK2;
16457 Copy(pat1, buf, l1 , char);
16458 Copy(pat2, buf + l1, l2 , char);
16459 buf[l1 + l2] = '\n';
16460 buf[l1 + l2 + 1] = '\0';
16461 va_start(args, pat2);
16462 msv = vmess(buf, &args);
16464 message = SvPV_const(msv,l1);
16467 Copy(message, buf, l1 , char);
16468 /* l1-1 to avoid \n */
16469 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16472 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
16474 #ifndef PERL_IN_XSUB_RE
16476 Perl_save_re_context(pTHX)
16480 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16482 const REGEXP * const rx = PM_GETRE(PL_curpm);
16485 for (i = 1; i <= RX_NPARENS(rx); i++) {
16486 char digits[TYPE_CHARS(long)];
16487 const STRLEN len = my_snprintf(digits, sizeof(digits),
16489 GV *const *const gvp
16490 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16493 GV * const gv = *gvp;
16494 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16506 S_put_byte(pTHX_ SV *sv, int c)
16508 PERL_ARGS_ASSERT_PUT_BYTE;
16512 case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16513 case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16514 case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16515 case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16516 case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16519 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16524 const char string = c;
16525 if (c == '-' || c == ']' || c == '\\' || c == '^')
16526 sv_catpvs(sv, "\\");
16527 sv_catpvn(sv, &string, 1);
16532 S_put_range(pTHX_ SV *sv, UV start, UV end)
16535 /* Appends to 'sv' a displayable version of the range of code points from
16536 * 'start' to 'end' */
16538 assert(start <= end);
16540 PERL_ARGS_ASSERT_PUT_RANGE;
16542 if (end - start < 3) { /* Individual chars in short ranges */
16543 for (; start <= end; start++)
16544 put_byte(sv, start);
16546 else if ( end > 255
16547 || ! isALPHANUMERIC(start)
16548 || ! isALPHANUMERIC(end)
16549 || isDIGIT(start) != isDIGIT(end)
16550 || isUPPER(start) != isUPPER(end)
16551 || isLOWER(start) != isLOWER(end)
16553 /* This final test should get optimized out except on EBCDIC
16554 * platforms, where it causes ranges that cross discontinuities
16555 * like i/j to be shown as hex instead of the misleading,
16556 * e.g. H-K (since that range includes more than H, I, J, K).
16558 || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16560 Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16562 (end < 256) ? end : 255);
16564 else { /* Here, the ends of the range are both digits, or both uppercase,
16565 or both lowercase; and there's no discontinuity in the range
16566 (which could happen on EBCDIC platforms) */
16567 put_byte(sv, start);
16568 sv_catpvs(sv, "-");
16574 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16576 /* Appends to 'sv' a displayable version of the innards of the bracketed
16577 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
16578 * output anything */
16581 bool has_output_anything = FALSE;
16583 PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16585 for (i = 0; i < 256; i++) {
16586 if (BITMAP_TEST((U8 *) bitmap,i)) {
16588 /* The character at index i should be output. Find the next
16589 * character that should NOT be output */
16591 for (j = i + 1; j < 256; j++) {
16592 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16597 /* Everything between them is a single range that should be output
16599 put_range(sv, i, j - 1);
16600 has_output_anything = TRUE;
16605 return has_output_anything;
16608 #define CLEAR_OPTSTART \
16609 if (optstart) STMT_START { \
16610 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
16611 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16615 #define DUMPUNTIL(b,e) \
16617 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16619 STATIC const regnode *
16620 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16621 const regnode *last, const regnode *plast,
16622 SV* sv, I32 indent, U32 depth)
16625 U8 op = PSEUDO; /* Arbitrary non-END op. */
16626 const regnode *next;
16627 const regnode *optstart= NULL;
16629 RXi_GET_DECL(r,ri);
16630 GET_RE_DEBUG_FLAGS_DECL;
16632 PERL_ARGS_ASSERT_DUMPUNTIL;
16634 #ifdef DEBUG_DUMPUNTIL
16635 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16636 last ? last-start : 0,plast ? plast-start : 0);
16639 if (plast && plast < last)
16642 while (PL_regkind[op] != END && (!last || node < last)) {
16643 /* While that wasn't END last time... */
16646 if (op == CLOSE || op == WHILEM)
16648 next = regnext((regnode *)node);
16651 if (OP(node) == OPTIMIZED) {
16652 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16659 regprop(r, sv, node, NULL);
16660 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16661 (int)(2*indent + 1), "", SvPVX_const(sv));
16663 if (OP(node) != OPTIMIZED) {
16664 if (next == NULL) /* Next ptr. */
16665 PerlIO_printf(Perl_debug_log, " (0)");
16666 else if (PL_regkind[(U8)op] == BRANCH
16667 && PL_regkind[OP(next)] != BRANCH )
16668 PerlIO_printf(Perl_debug_log, " (FAIL)");
16670 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16671 (void)PerlIO_putc(Perl_debug_log, '\n');
16675 if (PL_regkind[(U8)op] == BRANCHJ) {
16678 const regnode *nnode = (OP(next) == LONGJMP
16679 ? regnext((regnode *)next)
16681 if (last && nnode > last)
16683 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16686 else if (PL_regkind[(U8)op] == BRANCH) {
16688 DUMPUNTIL(NEXTOPER(node), next);
16690 else if ( PL_regkind[(U8)op] == TRIE ) {
16691 const regnode *this_trie = node;
16692 const char op = OP(node);
16693 const U32 n = ARG(node);
16694 const reg_ac_data * const ac = op>=AHOCORASICK ?
16695 (reg_ac_data *)ri->data->data[n] :
16697 const reg_trie_data * const trie =
16698 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16700 AV *const trie_words
16701 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16703 const regnode *nextbranch= NULL;
16706 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16707 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16709 PerlIO_printf(Perl_debug_log, "%*s%s ",
16710 (int)(2*(indent+3)), "",
16712 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16713 SvCUR(*elem_ptr), 60,
16714 PL_colors[0], PL_colors[1],
16716 ? PERL_PV_ESCAPE_UNI
16718 | PERL_PV_PRETTY_ELLIPSES
16719 | PERL_PV_PRETTY_LTGT
16724 U16 dist= trie->jump[word_idx+1];
16725 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16726 (UV)((dist ? this_trie + dist : next) - start));
16729 nextbranch= this_trie + trie->jump[0];
16730 DUMPUNTIL(this_trie + dist, nextbranch);
16732 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16733 nextbranch= regnext((regnode *)nextbranch);
16735 PerlIO_printf(Perl_debug_log, "\n");
16738 if (last && next > last)
16743 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
16744 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16745 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16747 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16749 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16751 else if ( op == PLUS || op == STAR) {
16752 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16754 else if (PL_regkind[(U8)op] == ANYOF) {
16755 /* arglen 1 + class block */
16756 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16757 ? ANYOF_POSIXL_SKIP
16759 node = NEXTOPER(node);
16761 else if (PL_regkind[(U8)op] == EXACT) {
16762 /* Literal string, where present. */
16763 node += NODE_SZ_STR(node) - 1;
16764 node = NEXTOPER(node);
16767 node = NEXTOPER(node);
16768 node += regarglen[(U8)op];
16770 if (op == CURLYX || op == OPEN)
16774 #ifdef DEBUG_DUMPUNTIL
16775 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16780 #endif /* DEBUGGING */
16784 * c-indentation-style: bsd
16785 * c-basic-offset: 4
16786 * indent-tabs-mode: nil
16789 * ex: set ts=8 sts=4 sw=4 et: