5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define STATIC static
106 struct RExC_state_t {
107 U32 flags; /* RXf_* are we folding, multilining? */
108 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 SSize_t whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit_bound; /* First regnode outside of the
121 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
122 implies compiling, so don't emit */
123 regnode_ssc emit_dummy; /* placeholder for emit to point to;
124 large enough for the largest
125 non-EXACTish node, so can use it as
127 I32 naughty; /* How bad is this pattern? */
128 I32 sawback; /* Did we see \1, ...? */
130 SSize_t size; /* Code size. */
131 I32 npar; /* Capture buffer count, (OPEN) plus
132 one. ("par" 0 is the whole
134 I32 nestroot; /* root parens we are in - used by
138 regnode **open_parens; /* pointers to open parens */
139 regnode **close_parens; /* pointers to close parens */
140 regnode *opend; /* END node in program */
141 I32 utf8; /* whether the pattern is utf8 or not */
142 I32 orig_utf8; /* whether the pattern was originally in utf8 */
143 /* XXX use this for future optimisation of case
144 * where pattern must be upgraded to utf8. */
145 I32 uni_semantics; /* If a d charset modifier should use unicode
146 rules, even if the pattern is not in
148 HV *paren_names; /* Paren names */
150 regnode **recurse; /* Recurse regops */
151 I32 recurse_count; /* Number of recurse regops */
152 U8 *study_chunk_recursed; /* bitmap of which parens we have moved
154 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
158 I32 override_recoding;
159 I32 in_multi_char_class;
160 struct reg_code_block *code_blocks; /* positions of literal (?{})
162 int num_code_blocks; /* size of code_blocks[] */
163 int code_index; /* next code_blocks[] slot */
164 SSize_t maxlen; /* mininum possible number of chars in string to match */
165 #ifdef ADD_TO_REGEXEC
166 char *starttry; /* -Dr: where regtry was called. */
167 #define RExC_starttry (pRExC_state->starttry)
169 SV *runtime_code_qr; /* qr with the runtime code blocks */
171 const char *lastparse;
173 AV *paren_name_list; /* idx -> name */
174 #define RExC_lastparse (pRExC_state->lastparse)
175 #define RExC_lastnum (pRExC_state->lastnum)
176 #define RExC_paren_name_list (pRExC_state->paren_name_list)
180 #define RExC_flags (pRExC_state->flags)
181 #define RExC_pm_flags (pRExC_state->pm_flags)
182 #define RExC_precomp (pRExC_state->precomp)
183 #define RExC_rx_sv (pRExC_state->rx_sv)
184 #define RExC_rx (pRExC_state->rx)
185 #define RExC_rxi (pRExC_state->rxi)
186 #define RExC_start (pRExC_state->start)
187 #define RExC_end (pRExC_state->end)
188 #define RExC_parse (pRExC_state->parse)
189 #define RExC_whilem_seen (pRExC_state->whilem_seen)
190 #ifdef RE_TRACK_PATTERN_OFFSETS
191 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
194 #define RExC_emit (pRExC_state->emit)
195 #define RExC_emit_dummy (pRExC_state->emit_dummy)
196 #define RExC_emit_start (pRExC_state->emit_start)
197 #define RExC_emit_bound (pRExC_state->emit_bound)
198 #define RExC_naughty (pRExC_state->naughty)
199 #define RExC_sawback (pRExC_state->sawback)
200 #define RExC_seen (pRExC_state->seen)
201 #define RExC_size (pRExC_state->size)
202 #define RExC_maxlen (pRExC_state->maxlen)
203 #define RExC_npar (pRExC_state->npar)
204 #define RExC_nestroot (pRExC_state->nestroot)
205 #define RExC_extralen (pRExC_state->extralen)
206 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
207 #define RExC_utf8 (pRExC_state->utf8)
208 #define RExC_uni_semantics (pRExC_state->uni_semantics)
209 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
210 #define RExC_open_parens (pRExC_state->open_parens)
211 #define RExC_close_parens (pRExC_state->close_parens)
212 #define RExC_opend (pRExC_state->opend)
213 #define RExC_paren_names (pRExC_state->paren_names)
214 #define RExC_recurse (pRExC_state->recurse)
215 #define RExC_recurse_count (pRExC_state->recurse_count)
216 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
217 #define RExC_study_chunk_recursed_bytes \
218 (pRExC_state->study_chunk_recursed_bytes)
219 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
220 #define RExC_contains_locale (pRExC_state->contains_locale)
221 #define RExC_contains_i (pRExC_state->contains_i)
222 #define RExC_override_recoding (pRExC_state->override_recoding)
223 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
226 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
227 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
228 ((*s) == '{' && regcurly(s)))
231 * Flags to be passed up and down.
233 #define WORST 0 /* Worst case. */
234 #define HASWIDTH 0x01 /* Known to match non-null strings. */
236 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
237 * character. (There needs to be a case: in the switch statement in regexec.c
238 * for any node marked SIMPLE.) Note that this is not the same thing as
241 #define SPSTART 0x04 /* Starts with * or + */
242 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
243 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
244 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
246 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
248 /* whether trie related optimizations are enabled */
249 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
250 #define TRIE_STUDY_OPT
251 #define FULL_TRIE_STUDY
257 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
258 #define PBITVAL(paren) (1 << ((paren) & 7))
259 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
260 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
261 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
263 #define REQUIRE_UTF8 STMT_START { \
265 *flagp = RESTART_UTF8; \
270 /* This converts the named class defined in regcomp.h to its equivalent class
271 * number defined in handy.h. */
272 #define namedclass_to_classnum(class) ((int) ((class) / 2))
273 #define classnum_to_namedclass(classnum) ((classnum) * 2)
275 #define _invlist_union_complement_2nd(a, b, output) \
276 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
277 #define _invlist_intersection_complement_2nd(a, b, output) \
278 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
280 /* About scan_data_t.
282 During optimisation we recurse through the regexp program performing
283 various inplace (keyhole style) optimisations. In addition study_chunk
284 and scan_commit populate this data structure with information about
285 what strings MUST appear in the pattern. We look for the longest
286 string that must appear at a fixed location, and we look for the
287 longest string that may appear at a floating location. So for instance
292 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
293 strings (because they follow a .* construct). study_chunk will identify
294 both FOO and BAR as being the longest fixed and floating strings respectively.
296 The strings can be composites, for instance
300 will result in a composite fixed substring 'foo'.
302 For each string some basic information is maintained:
304 - offset or min_offset
305 This is the position the string must appear at, or not before.
306 It also implicitly (when combined with minlenp) tells us how many
307 characters must match before the string we are searching for.
308 Likewise when combined with minlenp and the length of the string it
309 tells us how many characters must appear after the string we have
313 Only used for floating strings. This is the rightmost point that
314 the string can appear at. If set to SSize_t_MAX it indicates that the
315 string can occur infinitely far to the right.
318 A pointer to the minimum number of characters of the pattern that the
319 string was found inside. This is important as in the case of positive
320 lookahead or positive lookbehind we can have multiple patterns
325 The minimum length of the pattern overall is 3, the minimum length
326 of the lookahead part is 3, but the minimum length of the part that
327 will actually match is 1. So 'FOO's minimum length is 3, but the
328 minimum length for the F is 1. This is important as the minimum length
329 is used to determine offsets in front of and behind the string being
330 looked for. Since strings can be composites this is the length of the
331 pattern at the time it was committed with a scan_commit. Note that
332 the length is calculated by study_chunk, so that the minimum lengths
333 are not known until the full pattern has been compiled, thus the
334 pointer to the value.
338 In the case of lookbehind the string being searched for can be
339 offset past the start point of the final matching string.
340 If this value was just blithely removed from the min_offset it would
341 invalidate some of the calculations for how many chars must match
342 before or after (as they are derived from min_offset and minlen and
343 the length of the string being searched for).
344 When the final pattern is compiled and the data is moved from the
345 scan_data_t structure into the regexp structure the information
346 about lookbehind is factored in, with the information that would
347 have been lost precalculated in the end_shift field for the
350 The fields pos_min and pos_delta are used to store the minimum offset
351 and the delta to the maximum offset at the current point in the pattern.
355 typedef struct scan_data_t {
356 /*I32 len_min; unused */
357 /*I32 len_delta; unused */
361 SSize_t last_end; /* min value, <0 unless valid. */
362 SSize_t last_start_min;
363 SSize_t last_start_max;
364 SV **longest; /* Either &l_fixed, or &l_float. */
365 SV *longest_fixed; /* longest fixed string found in pattern */
366 SSize_t offset_fixed; /* offset where it starts */
367 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
368 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
369 SV *longest_float; /* longest floating string found in pattern */
370 SSize_t offset_float_min; /* earliest point in string it can appear */
371 SSize_t offset_float_max; /* latest point in string it can appear */
372 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
373 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
376 SSize_t *last_closep;
377 regnode_ssc *start_class;
380 /* The below is perhaps overboard, but this allows us to save a test at the
381 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
382 * and 'a' differ by a single bit; the same with the upper and lower case of
383 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
384 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
385 * then inverts it to form a mask, with just a single 0, in the bit position
386 * where the upper- and lowercase differ. XXX There are about 40 other
387 * instances in the Perl core where this micro-optimization could be used.
388 * Should decide if maintenance cost is worse, before changing those
390 * Returns a boolean as to whether or not 'v' is either a lowercase or
391 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
392 * compile-time constant, the generated code is better than some optimizing
393 * compilers figure out, amounting to a mask and test. The results are
394 * meaningless if 'c' is not one of [A-Za-z] */
395 #define isARG2_lower_or_UPPER_ARG1(c, v) \
396 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
399 * Forward declarations for pregcomp()'s friends.
402 static const scan_data_t zero_scan_data =
403 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
405 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
406 #define SF_BEFORE_SEOL 0x0001
407 #define SF_BEFORE_MEOL 0x0002
408 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
409 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
411 #define SF_FIX_SHIFT_EOL (+2)
412 #define SF_FL_SHIFT_EOL (+4)
414 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
415 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
417 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
418 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
419 #define SF_IS_INF 0x0040
420 #define SF_HAS_PAR 0x0080
421 #define SF_IN_PAR 0x0100
422 #define SF_HAS_EVAL 0x0200
423 #define SCF_DO_SUBSTR 0x0400
424 #define SCF_DO_STCLASS_AND 0x0800
425 #define SCF_DO_STCLASS_OR 0x1000
426 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
427 #define SCF_WHILEM_VISITED_POS 0x2000
429 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
430 #define SCF_SEEN_ACCEPT 0x8000
431 #define SCF_TRIE_DOING_RESTUDY 0x10000
433 #define UTF cBOOL(RExC_utf8)
435 /* The enums for all these are ordered so things work out correctly */
436 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
437 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
438 == REGEX_DEPENDS_CHARSET)
439 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
440 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
441 >= REGEX_UNICODE_CHARSET)
442 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
443 == REGEX_ASCII_RESTRICTED_CHARSET)
444 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
445 >= REGEX_ASCII_RESTRICTED_CHARSET)
446 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
447 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
449 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
451 /* For programs that want to be strictly Unicode compatible by dying if any
452 * attempt is made to match a non-Unicode code point against a Unicode
454 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
456 #define OOB_NAMEDCLASS -1
458 /* There is no code point that is out-of-bounds, so this is problematic. But
459 * its only current use is to initialize a variable that is always set before
461 #define OOB_UNICODE 0xDEADBEEF
463 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
464 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
467 /* length of regex to show in messages that don't mark a position within */
468 #define RegexLengthToShowInErrorMessages 127
471 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
472 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
473 * op/pragma/warn/regcomp.
475 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
476 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
478 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
479 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
481 #define REPORT_LOCATION_ARGS(offset) \
482 UTF8fARG(UTF, offset, RExC_precomp), \
483 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
486 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
487 * arg. Show regex, up to a maximum length. If it's too long, chop and add
490 #define _FAIL(code) STMT_START { \
491 const char *ellipses = ""; \
492 IV len = RExC_end - RExC_precomp; \
495 SAVEFREESV(RExC_rx_sv); \
496 if (len > RegexLengthToShowInErrorMessages) { \
497 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
498 len = RegexLengthToShowInErrorMessages - 10; \
504 #define FAIL(msg) _FAIL( \
505 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
506 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
508 #define FAIL2(msg,arg) _FAIL( \
509 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
510 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
513 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
515 #define Simple_vFAIL(m) STMT_START { \
516 const IV offset = RExC_parse - RExC_precomp; \
517 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
518 m, REPORT_LOCATION_ARGS(offset)); \
522 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
524 #define vFAIL(m) STMT_START { \
526 SAVEFREESV(RExC_rx_sv); \
531 * Like Simple_vFAIL(), but accepts two arguments.
533 #define Simple_vFAIL2(m,a1) STMT_START { \
534 const IV offset = RExC_parse - RExC_precomp; \
535 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
536 REPORT_LOCATION_ARGS(offset)); \
540 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
542 #define vFAIL2(m,a1) STMT_START { \
544 SAVEFREESV(RExC_rx_sv); \
545 Simple_vFAIL2(m, a1); \
550 * Like Simple_vFAIL(), but accepts three arguments.
552 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
553 const IV offset = RExC_parse - RExC_precomp; \
554 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
555 REPORT_LOCATION_ARGS(offset)); \
559 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
561 #define vFAIL3(m,a1,a2) STMT_START { \
563 SAVEFREESV(RExC_rx_sv); \
564 Simple_vFAIL3(m, a1, a2); \
568 * Like Simple_vFAIL(), but accepts four arguments.
570 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
571 const IV offset = RExC_parse - RExC_precomp; \
572 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
573 REPORT_LOCATION_ARGS(offset)); \
576 #define vFAIL4(m,a1,a2,a3) STMT_START { \
578 SAVEFREESV(RExC_rx_sv); \
579 Simple_vFAIL4(m, a1, a2, a3); \
582 /* A specialized version of vFAIL2 that works with UTF8f */
583 #define vFAIL2utf8f(m, a1) STMT_START { \
584 const IV offset = RExC_parse - RExC_precomp; \
586 SAVEFREESV(RExC_rx_sv); \
587 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
588 REPORT_LOCATION_ARGS(offset)); \
592 /* m is not necessarily a "literal string", in this macro */
593 #define reg_warn_non_literal_string(loc, m) STMT_START { \
594 const IV offset = loc - RExC_precomp; \
595 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
596 m, REPORT_LOCATION_ARGS(offset)); \
599 #define ckWARNreg(loc,m) STMT_START { \
600 const IV offset = loc - RExC_precomp; \
601 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
602 REPORT_LOCATION_ARGS(offset)); \
605 #define vWARN_dep(loc, m) STMT_START { \
606 const IV offset = loc - RExC_precomp; \
607 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
608 REPORT_LOCATION_ARGS(offset)); \
611 #define ckWARNdep(loc,m) STMT_START { \
612 const IV offset = loc - RExC_precomp; \
613 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
615 REPORT_LOCATION_ARGS(offset)); \
618 #define ckWARNregdep(loc,m) STMT_START { \
619 const IV offset = loc - RExC_precomp; \
620 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
622 REPORT_LOCATION_ARGS(offset)); \
625 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
626 const IV offset = loc - RExC_precomp; \
627 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
629 a1, REPORT_LOCATION_ARGS(offset)); \
632 #define ckWARN2reg(loc, m, a1) STMT_START { \
633 const IV offset = loc - RExC_precomp; \
634 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
635 a1, REPORT_LOCATION_ARGS(offset)); \
638 #define vWARN3(loc, m, a1, a2) STMT_START { \
639 const IV offset = loc - RExC_precomp; \
640 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
641 a1, a2, REPORT_LOCATION_ARGS(offset)); \
644 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
645 const IV offset = loc - RExC_precomp; \
646 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
647 a1, a2, REPORT_LOCATION_ARGS(offset)); \
650 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
651 const IV offset = loc - RExC_precomp; \
652 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
653 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
656 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
657 const IV offset = loc - RExC_precomp; \
658 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
659 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
662 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
663 const IV offset = loc - RExC_precomp; \
664 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
665 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
669 /* Allow for side effects in s */
670 #define REGC(c,s) STMT_START { \
671 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
674 /* Macros for recording node offsets. 20001227 mjd@plover.com
675 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
676 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
677 * Element 0 holds the number n.
678 * Position is 1 indexed.
680 #ifndef RE_TRACK_PATTERN_OFFSETS
681 #define Set_Node_Offset_To_R(node,byte)
682 #define Set_Node_Offset(node,byte)
683 #define Set_Cur_Node_Offset
684 #define Set_Node_Length_To_R(node,len)
685 #define Set_Node_Length(node,len)
686 #define Set_Node_Cur_Length(node,start)
687 #define Node_Offset(n)
688 #define Node_Length(n)
689 #define Set_Node_Offset_Length(node,offset,len)
690 #define ProgLen(ri) ri->u.proglen
691 #define SetProgLen(ri,x) ri->u.proglen = x
693 #define ProgLen(ri) ri->u.offsets[0]
694 #define SetProgLen(ri,x) ri->u.offsets[0] = x
695 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
697 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
698 __LINE__, (int)(node), (int)(byte))); \
700 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
703 RExC_offsets[2*(node)-1] = (byte); \
708 #define Set_Node_Offset(node,byte) \
709 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
710 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
712 #define Set_Node_Length_To_R(node,len) STMT_START { \
714 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
715 __LINE__, (int)(node), (int)(len))); \
717 Perl_croak(aTHX_ "value of node is %d in Length macro", \
720 RExC_offsets[2*(node)] = (len); \
725 #define Set_Node_Length(node,len) \
726 Set_Node_Length_To_R((node)-RExC_emit_start, len)
727 #define Set_Node_Cur_Length(node, start) \
728 Set_Node_Length(node, RExC_parse - start)
730 /* Get offsets and lengths */
731 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
732 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
734 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
735 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
736 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
740 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
741 #define EXPERIMENTAL_INPLACESCAN
742 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
744 #define DEBUG_RExC_seen() \
745 DEBUG_OPTIMISE_MORE_r({ \
746 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
748 if (RExC_seen & REG_ZERO_LEN_SEEN) \
749 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
751 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
752 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
754 if (RExC_seen & REG_GPOS_SEEN) \
755 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
757 if (RExC_seen & REG_CANY_SEEN) \
758 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
760 if (RExC_seen & REG_RECURSE_SEEN) \
761 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
763 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
764 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
766 if (RExC_seen & REG_VERBARG_SEEN) \
767 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
769 if (RExC_seen & REG_CUTGROUP_SEEN) \
770 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
772 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
773 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
775 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
776 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
778 if (RExC_seen & REG_GOSTART_SEEN) \
779 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
781 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
782 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
784 PerlIO_printf(Perl_debug_log,"\n"); \
787 #define DEBUG_STUDYDATA(str,data,depth) \
788 DEBUG_OPTIMISE_MORE_r(if(data){ \
789 PerlIO_printf(Perl_debug_log, \
790 "%*s" str "Pos:%"IVdf"/%"IVdf \
791 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
792 (int)(depth)*2, "", \
793 (IV)((data)->pos_min), \
794 (IV)((data)->pos_delta), \
795 (UV)((data)->flags), \
796 (IV)((data)->whilem_c), \
797 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
798 is_inf ? "INF " : "" \
800 if ((data)->last_found) \
801 PerlIO_printf(Perl_debug_log, \
802 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
803 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
804 SvPVX_const((data)->last_found), \
805 (IV)((data)->last_end), \
806 (IV)((data)->last_start_min), \
807 (IV)((data)->last_start_max), \
808 ((data)->longest && \
809 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
810 SvPVX_const((data)->longest_fixed), \
811 (IV)((data)->offset_fixed), \
812 ((data)->longest && \
813 (data)->longest==&((data)->longest_float)) ? "*" : "", \
814 SvPVX_const((data)->longest_float), \
815 (IV)((data)->offset_float_min), \
816 (IV)((data)->offset_float_max) \
818 PerlIO_printf(Perl_debug_log,"\n"); \
821 /* Mark that we cannot extend a found fixed substring at this point.
822 Update the longest found anchored substring and the longest found
823 floating substrings if needed. */
826 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
827 SSize_t *minlenp, int is_inf)
829 const STRLEN l = CHR_SVLEN(data->last_found);
830 const STRLEN old_l = CHR_SVLEN(*data->longest);
831 GET_RE_DEBUG_FLAGS_DECL;
833 PERL_ARGS_ASSERT_SCAN_COMMIT;
835 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
836 SvSetMagicSV(*data->longest, data->last_found);
837 if (*data->longest == data->longest_fixed) {
838 data->offset_fixed = l ? data->last_start_min : data->pos_min;
839 if (data->flags & SF_BEFORE_EOL)
841 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
843 data->flags &= ~SF_FIX_BEFORE_EOL;
844 data->minlen_fixed=minlenp;
845 data->lookbehind_fixed=0;
847 else { /* *data->longest == data->longest_float */
848 data->offset_float_min = l ? data->last_start_min : data->pos_min;
849 data->offset_float_max = (l
850 ? data->last_start_max
851 : (data->pos_delta == SSize_t_MAX
853 : data->pos_min + data->pos_delta));
855 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
856 data->offset_float_max = SSize_t_MAX;
857 if (data->flags & SF_BEFORE_EOL)
859 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
861 data->flags &= ~SF_FL_BEFORE_EOL;
862 data->minlen_float=minlenp;
863 data->lookbehind_float=0;
866 SvCUR_set(data->last_found, 0);
868 SV * const sv = data->last_found;
869 if (SvUTF8(sv) && SvMAGICAL(sv)) {
870 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
876 data->flags &= ~SF_BEFORE_EOL;
877 DEBUG_STUDYDATA("commit: ",data,0);
880 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
881 * list that describes which code points it matches */
884 S_ssc_anything(pTHX_ regnode_ssc *ssc)
886 /* Set the SSC 'ssc' to match an empty string or any code point */
888 PERL_ARGS_ASSERT_SSC_ANYTHING;
890 assert(is_ANYOF_SYNTHETIC(ssc));
892 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
893 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
894 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
898 S_ssc_is_anything(const regnode_ssc *ssc)
900 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
901 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
902 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
903 * in any way, so there's no point in using it */
908 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
910 assert(is_ANYOF_SYNTHETIC(ssc));
912 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
916 /* See if the list consists solely of the range 0 - Infinity */
917 invlist_iterinit(ssc->invlist);
918 ret = invlist_iternext(ssc->invlist, &start, &end)
922 invlist_iterfinish(ssc->invlist);
928 /* If e.g., both \w and \W are set, matches everything */
929 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
931 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
932 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
942 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
944 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
945 * string, any code point, or any posix class under locale */
947 PERL_ARGS_ASSERT_SSC_INIT;
949 Zero(ssc, 1, regnode_ssc);
950 set_ANYOF_SYNTHETIC(ssc);
951 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
954 /* If any portion of the regex is to operate under locale rules,
955 * initialization includes it. The reason this isn't done for all regexes
956 * is that the optimizer was written under the assumption that locale was
957 * all-or-nothing. Given the complexity and lack of documentation in the
958 * optimizer, and that there are inadequate test cases for locale, many
959 * parts of it may not work properly, it is safest to avoid locale unless
961 if (RExC_contains_locale) {
962 ANYOF_POSIXL_SETALL(ssc);
965 ANYOF_POSIXL_ZERO(ssc);
970 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
971 const regnode_ssc *ssc)
973 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
974 * to the list of code points matched, and locale posix classes; hence does
975 * not check its flags) */
980 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
982 assert(is_ANYOF_SYNTHETIC(ssc));
984 invlist_iterinit(ssc->invlist);
985 ret = invlist_iternext(ssc->invlist, &start, &end)
989 invlist_iterfinish(ssc->invlist);
995 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1003 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1004 const regnode_charclass* const node)
1006 /* Returns a mortal inversion list defining which code points are matched
1007 * by 'node', which is of type ANYOF. Handles complementing the result if
1008 * appropriate. If some code points aren't knowable at this time, the
1009 * returned list must, and will, contain every code point that is a
1012 SV* invlist = sv_2mortal(_new_invlist(0));
1013 SV* only_utf8_locale_invlist = NULL;
1015 const U32 n = ARG(node);
1016 bool new_node_has_latin1 = FALSE;
1018 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1020 /* Look at the data structure created by S_set_ANYOF_arg() */
1021 if (n != ANYOF_NONBITMAP_EMPTY) {
1022 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1023 AV * const av = MUTABLE_AV(SvRV(rv));
1024 SV **const ary = AvARRAY(av);
1025 assert(RExC_rxi->data->what[n] == 's');
1027 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1028 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1030 else if (ary[0] && ary[0] != &PL_sv_undef) {
1032 /* Here, no compile-time swash, and there are things that won't be
1033 * known until runtime -- we have to assume it could be anything */
1034 return _add_range_to_invlist(invlist, 0, UV_MAX);
1036 else if (ary[3] && ary[3] != &PL_sv_undef) {
1038 /* Here no compile-time swash, and no run-time only data. Use the
1039 * node's inversion list */
1040 invlist = sv_2mortal(invlist_clone(ary[3]));
1043 /* Get the code points valid only under UTF-8 locales */
1044 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1045 && ary[2] && ary[2] != &PL_sv_undef)
1047 only_utf8_locale_invlist = ary[2];
1051 /* An ANYOF node contains a bitmap for the first 256 code points, and an
1052 * inversion list for the others, but if there are code points that should
1053 * match only conditionally on the target string being UTF-8, those are
1054 * placed in the inversion list, and not the bitmap. Since there are
1055 * circumstances under which they could match, they are included in the
1056 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
1057 * here, so that when we invert below, the end result actually does include
1058 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
1059 * before we add the unconditionally matched code points */
1060 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1061 _invlist_intersection_complement_2nd(invlist,
1066 /* Add in the points from the bit map */
1067 for (i = 0; i < 256; i++) {
1068 if (ANYOF_BITMAP_TEST(node, i)) {
1069 invlist = add_cp_to_invlist(invlist, i);
1070 new_node_has_latin1 = TRUE;
1074 /* If this can match all upper Latin1 code points, have to add them
1076 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1077 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1080 /* Similarly for these */
1081 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1082 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1085 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1086 _invlist_invert(invlist);
1088 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1090 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1091 * locale. We can skip this if there are no 0-255 at all. */
1092 _invlist_union(invlist, PL_Latin1, &invlist);
1095 /* Similarly add the UTF-8 locale possible matches. These have to be
1096 * deferred until after the non-UTF-8 locale ones are taken care of just
1097 * above, or it leads to wrong results under ANYOF_INVERT */
1098 if (only_utf8_locale_invlist) {
1099 _invlist_union_maybe_complement_2nd(invlist,
1100 only_utf8_locale_invlist,
1101 ANYOF_FLAGS(node) & ANYOF_INVERT,
1108 /* These two functions currently do the exact same thing */
1109 #define ssc_init_zero ssc_init
1111 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1112 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1114 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1115 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
1116 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1119 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1120 const regnode_charclass *and_with)
1122 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1123 * another SSC or a regular ANYOF class. Can create false positives. */
1128 PERL_ARGS_ASSERT_SSC_AND;
1130 assert(is_ANYOF_SYNTHETIC(ssc));
1132 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1133 * the code point inversion list and just the relevant flags */
1134 if (is_ANYOF_SYNTHETIC(and_with)) {
1135 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1136 anded_flags = ANYOF_FLAGS(and_with);
1138 /* XXX This is a kludge around what appears to be deficiencies in the
1139 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1140 * there are paths through the optimizer where it doesn't get weeded
1141 * out when it should. And if we don't make some extra provision for
1142 * it like the code just below, it doesn't get added when it should.
1143 * This solution is to add it only when AND'ing, which is here, and
1144 * only when what is being AND'ed is the pristine, original node
1145 * matching anything. Thus it is like adding it to ssc_anything() but
1146 * only when the result is to be AND'ed. Probably the same solution
1147 * could be adopted for the same problem we have with /l matching,
1148 * which is solved differently in S_ssc_init(), and that would lead to
1149 * fewer false positives than that solution has. But if this solution
1150 * creates bugs, the consequences are only that a warning isn't raised
1151 * that should be; while the consequences for having /l bugs is
1152 * incorrect matches */
1153 if (ssc_is_anything((regnode_ssc *)and_with)) {
1154 anded_flags |= ANYOF_WARN_SUPER;
1158 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1159 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1162 ANYOF_FLAGS(ssc) &= anded_flags;
1164 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1165 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1166 * 'and_with' may be inverted. When not inverted, we have the situation of
1168 * (C1 | P1) & (C2 | P2)
1169 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1170 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1171 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1172 * <= ((C1 & C2) | P1 | P2)
1173 * Alternatively, the last few steps could be:
1174 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1175 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1176 * <= (C1 | C2 | (P1 & P2))
1177 * We favor the second approach if either P1 or P2 is non-empty. This is
1178 * because these components are a barrier to doing optimizations, as what
1179 * they match cannot be known until the moment of matching as they are
1180 * dependent on the current locale, 'AND"ing them likely will reduce or
1182 * But we can do better if we know that C1,P1 are in their initial state (a
1183 * frequent occurrence), each matching everything:
1184 * (<everything>) & (C2 | P2) = C2 | P2
1185 * Similarly, if C2,P2 are in their initial state (again a frequent
1186 * occurrence), the result is a no-op
1187 * (C1 | P1) & (<everything>) = C1 | P1
1190 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1191 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1192 * <= (C1 & ~C2) | (P1 & ~P2)
1195 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1196 && ! is_ANYOF_SYNTHETIC(and_with))
1200 ssc_intersection(ssc,
1202 FALSE /* Has already been inverted */
1205 /* If either P1 or P2 is empty, the intersection will be also; can skip
1207 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1208 ANYOF_POSIXL_ZERO(ssc);
1210 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1212 /* Note that the Posix class component P from 'and_with' actually
1214 * P = Pa | Pb | ... | Pn
1215 * where each component is one posix class, such as in [\w\s].
1217 * ~P = ~(Pa | Pb | ... | Pn)
1218 * = ~Pa & ~Pb & ... & ~Pn
1219 * <= ~Pa | ~Pb | ... | ~Pn
1220 * The last is something we can easily calculate, but unfortunately
1221 * is likely to have many false positives. We could do better
1222 * in some (but certainly not all) instances if two classes in
1223 * P have known relationships. For example
1224 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1226 * :lower: & :print: = :lower:
1227 * And similarly for classes that must be disjoint. For example,
1228 * since \s and \w can have no elements in common based on rules in
1229 * the POSIX standard,
1230 * \w & ^\S = nothing
1231 * Unfortunately, some vendor locales do not meet the Posix
1232 * standard, in particular almost everything by Microsoft.
1233 * The loop below just changes e.g., \w into \W and vice versa */
1235 regnode_charclass_posixl temp;
1236 int add = 1; /* To calculate the index of the complement */
1238 ANYOF_POSIXL_ZERO(&temp);
1239 for (i = 0; i < ANYOF_MAX; i++) {
1241 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1242 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1244 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1245 ANYOF_POSIXL_SET(&temp, i + add);
1247 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1249 ANYOF_POSIXL_AND(&temp, ssc);
1251 } /* else ssc already has no posixes */
1252 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1253 in its initial state */
1254 else if (! is_ANYOF_SYNTHETIC(and_with)
1255 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1257 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1258 * copy it over 'ssc' */
1259 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1260 if (is_ANYOF_SYNTHETIC(and_with)) {
1261 StructCopy(and_with, ssc, regnode_ssc);
1264 ssc->invlist = anded_cp_list;
1265 ANYOF_POSIXL_ZERO(ssc);
1266 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1267 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1271 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1272 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1274 /* One or the other of P1, P2 is non-empty. */
1275 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1276 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1278 ssc_union(ssc, anded_cp_list, FALSE);
1280 else { /* P1 = P2 = empty */
1281 ssc_intersection(ssc, anded_cp_list, FALSE);
1287 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1288 const regnode_charclass *or_with)
1290 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1291 * another SSC or a regular ANYOF class. Can create false positives if
1292 * 'or_with' is to be inverted. */
1297 PERL_ARGS_ASSERT_SSC_OR;
1299 assert(is_ANYOF_SYNTHETIC(ssc));
1301 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1302 * the code point inversion list and just the relevant flags */
1303 if (is_ANYOF_SYNTHETIC(or_with)) {
1304 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1305 ored_flags = ANYOF_FLAGS(or_with);
1308 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1309 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1312 ANYOF_FLAGS(ssc) |= ored_flags;
1314 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1315 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1316 * 'or_with' may be inverted. When not inverted, we have the simple
1317 * situation of computing:
1318 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1319 * If P1|P2 yields a situation with both a class and its complement are
1320 * set, like having both \w and \W, this matches all code points, and we
1321 * can delete these from the P component of the ssc going forward. XXX We
1322 * might be able to delete all the P components, but I (khw) am not certain
1323 * about this, and it is better to be safe.
1326 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1327 * <= (C1 | P1) | ~C2
1328 * <= (C1 | ~C2) | P1
1329 * (which results in actually simpler code than the non-inverted case)
1332 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1333 && ! is_ANYOF_SYNTHETIC(or_with))
1335 /* We ignore P2, leaving P1 going forward */
1336 } /* else Not inverted */
1337 else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1338 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1339 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1341 for (i = 0; i < ANYOF_MAX; i += 2) {
1342 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1344 ssc_match_all_cp(ssc);
1345 ANYOF_POSIXL_CLEAR(ssc, i);
1346 ANYOF_POSIXL_CLEAR(ssc, i+1);
1354 FALSE /* Already has been inverted */
1358 PERL_STATIC_INLINE void
1359 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1361 PERL_ARGS_ASSERT_SSC_UNION;
1363 assert(is_ANYOF_SYNTHETIC(ssc));
1365 _invlist_union_maybe_complement_2nd(ssc->invlist,
1371 PERL_STATIC_INLINE void
1372 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1374 const bool invert2nd)
1376 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1378 assert(is_ANYOF_SYNTHETIC(ssc));
1380 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1386 PERL_STATIC_INLINE void
1387 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1389 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1391 assert(is_ANYOF_SYNTHETIC(ssc));
1393 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1396 PERL_STATIC_INLINE void
1397 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1399 /* AND just the single code point 'cp' into the SSC 'ssc' */
1401 SV* cp_list = _new_invlist(2);
1403 PERL_ARGS_ASSERT_SSC_CP_AND;
1405 assert(is_ANYOF_SYNTHETIC(ssc));
1407 cp_list = add_cp_to_invlist(cp_list, cp);
1408 ssc_intersection(ssc, cp_list,
1409 FALSE /* Not inverted */
1411 SvREFCNT_dec_NN(cp_list);
1414 PERL_STATIC_INLINE void
1415 S_ssc_clear_locale(regnode_ssc *ssc)
1417 /* Set the SSC 'ssc' to not match any locale things */
1418 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1420 assert(is_ANYOF_SYNTHETIC(ssc));
1422 ANYOF_POSIXL_ZERO(ssc);
1423 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1427 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1429 /* The inversion list in the SSC is marked mortal; now we need a more
1430 * permanent copy, which is stored the same way that is done in a regular
1431 * ANYOF node, with the first 256 code points in a bit map */
1433 SV* invlist = invlist_clone(ssc->invlist);
1435 PERL_ARGS_ASSERT_SSC_FINALIZE;
1437 assert(is_ANYOF_SYNTHETIC(ssc));
1439 /* The code in this file assumes that all but these flags aren't relevant
1440 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1441 * time we reach here */
1442 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1444 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1446 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1447 NULL, NULL, NULL, FALSE);
1449 /* Make sure is clone-safe */
1450 ssc->invlist = NULL;
1452 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1453 ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1456 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1459 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1460 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1461 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1462 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1463 ? (TRIE_LIST_CUR( idx ) - 1) \
1469 dump_trie(trie,widecharmap,revcharmap)
1470 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1471 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1473 These routines dump out a trie in a somewhat readable format.
1474 The _interim_ variants are used for debugging the interim
1475 tables that are used to generate the final compressed
1476 representation which is what dump_trie expects.
1478 Part of the reason for their existence is to provide a form
1479 of documentation as to how the different representations function.
1484 Dumps the final compressed table form of the trie to Perl_debug_log.
1485 Used for debugging make_trie().
1489 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1490 AV *revcharmap, U32 depth)
1493 SV *sv=sv_newmortal();
1494 int colwidth= widecharmap ? 6 : 4;
1496 GET_RE_DEBUG_FLAGS_DECL;
1498 PERL_ARGS_ASSERT_DUMP_TRIE;
1500 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1501 (int)depth * 2 + 2,"",
1502 "Match","Base","Ofs" );
1504 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1505 SV ** const tmp = av_fetch( revcharmap, state, 0);
1507 PerlIO_printf( Perl_debug_log, "%*s",
1509 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1510 PL_colors[0], PL_colors[1],
1511 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1512 PERL_PV_ESCAPE_FIRSTCHAR
1517 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1518 (int)depth * 2 + 2,"");
1520 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1521 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1522 PerlIO_printf( Perl_debug_log, "\n");
1524 for( state = 1 ; state < trie->statecount ; state++ ) {
1525 const U32 base = trie->states[ state ].trans.base;
1527 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1528 (int)depth * 2 + 2,"", (UV)state);
1530 if ( trie->states[ state ].wordnum ) {
1531 PerlIO_printf( Perl_debug_log, " W%4X",
1532 trie->states[ state ].wordnum );
1534 PerlIO_printf( Perl_debug_log, "%6s", "" );
1537 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1542 while( ( base + ofs < trie->uniquecharcount ) ||
1543 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1544 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1548 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1550 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1551 if ( ( base + ofs >= trie->uniquecharcount )
1552 && ( base + ofs - trie->uniquecharcount
1554 && trie->trans[ base + ofs
1555 - trie->uniquecharcount ].check == state )
1557 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1559 (UV)trie->trans[ base + ofs
1560 - trie->uniquecharcount ].next );
1562 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1566 PerlIO_printf( Perl_debug_log, "]");
1569 PerlIO_printf( Perl_debug_log, "\n" );
1571 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1573 for (word=1; word <= trie->wordcount; word++) {
1574 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1575 (int)word, (int)(trie->wordinfo[word].prev),
1576 (int)(trie->wordinfo[word].len));
1578 PerlIO_printf(Perl_debug_log, "\n" );
1581 Dumps a fully constructed but uncompressed trie in list form.
1582 List tries normally only are used for construction when the number of
1583 possible chars (trie->uniquecharcount) is very high.
1584 Used for debugging make_trie().
1587 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1588 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1592 SV *sv=sv_newmortal();
1593 int colwidth= widecharmap ? 6 : 4;
1594 GET_RE_DEBUG_FLAGS_DECL;
1596 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1598 /* print out the table precompression. */
1599 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1600 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1601 "------:-----+-----------------\n" );
1603 for( state=1 ; state < next_alloc ; state ++ ) {
1606 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1607 (int)depth * 2 + 2,"", (UV)state );
1608 if ( ! trie->states[ state ].wordnum ) {
1609 PerlIO_printf( Perl_debug_log, "%5s| ","");
1611 PerlIO_printf( Perl_debug_log, "W%4x| ",
1612 trie->states[ state ].wordnum
1615 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1616 SV ** const tmp = av_fetch( revcharmap,
1617 TRIE_LIST_ITEM(state,charid).forid, 0);
1619 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1621 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1623 PL_colors[0], PL_colors[1],
1624 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1625 | PERL_PV_ESCAPE_FIRSTCHAR
1627 TRIE_LIST_ITEM(state,charid).forid,
1628 (UV)TRIE_LIST_ITEM(state,charid).newstate
1631 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1632 (int)((depth * 2) + 14), "");
1635 PerlIO_printf( Perl_debug_log, "\n");
1640 Dumps a fully constructed but uncompressed trie in table form.
1641 This is the normal DFA style state transition table, with a few
1642 twists to facilitate compression later.
1643 Used for debugging make_trie().
1646 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1647 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1652 SV *sv=sv_newmortal();
1653 int colwidth= widecharmap ? 6 : 4;
1654 GET_RE_DEBUG_FLAGS_DECL;
1656 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1659 print out the table precompression so that we can do a visual check
1660 that they are identical.
1663 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1665 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1666 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1668 PerlIO_printf( Perl_debug_log, "%*s",
1670 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1671 PL_colors[0], PL_colors[1],
1672 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1673 PERL_PV_ESCAPE_FIRSTCHAR
1679 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1681 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1682 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1685 PerlIO_printf( Perl_debug_log, "\n" );
1687 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1689 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1690 (int)depth * 2 + 2,"",
1691 (UV)TRIE_NODENUM( state ) );
1693 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1694 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1696 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1698 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1700 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1701 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1702 (UV)trie->trans[ state ].check );
1704 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1705 (UV)trie->trans[ state ].check,
1706 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1714 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1715 startbranch: the first branch in the whole branch sequence
1716 first : start branch of sequence of branch-exact nodes.
1717 May be the same as startbranch
1718 last : Thing following the last branch.
1719 May be the same as tail.
1720 tail : item following the branch sequence
1721 count : words in the sequence
1722 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1723 depth : indent depth
1725 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1727 A trie is an N'ary tree where the branches are determined by digital
1728 decomposition of the key. IE, at the root node you look up the 1st character and
1729 follow that branch repeat until you find the end of the branches. Nodes can be
1730 marked as "accepting" meaning they represent a complete word. Eg:
1734 would convert into the following structure. Numbers represent states, letters
1735 following numbers represent valid transitions on the letter from that state, if
1736 the number is in square brackets it represents an accepting state, otherwise it
1737 will be in parenthesis.
1739 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1743 (1) +-i->(6)-+-s->[7]
1745 +-s->(3)-+-h->(4)-+-e->[5]
1747 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1749 This shows that when matching against the string 'hers' we will begin at state 1
1750 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1751 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1752 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1753 single traverse. We store a mapping from accepting to state to which word was
1754 matched, and then when we have multiple possibilities we try to complete the
1755 rest of the regex in the order in which they occured in the alternation.
1757 The only prior NFA like behaviour that would be changed by the TRIE support is
1758 the silent ignoring of duplicate alternations which are of the form:
1760 / (DUPE|DUPE) X? (?{ ... }) Y /x
1762 Thus EVAL blocks following a trie may be called a different number of times with
1763 and without the optimisation. With the optimisations dupes will be silently
1764 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1765 the following demonstrates:
1767 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1769 which prints out 'word' three times, but
1771 'words'=~/(word|word|word)(?{ print $1 })S/
1773 which doesnt print it out at all. This is due to other optimisations kicking in.
1775 Example of what happens on a structural level:
1777 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1779 1: CURLYM[1] {1,32767}(18)
1790 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1791 and should turn into:
1793 1: CURLYM[1] {1,32767}(18)
1795 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1803 Cases where tail != last would be like /(?foo|bar)baz/:
1813 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1814 and would end up looking like:
1817 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1824 d = uvchr_to_utf8_flags(d, uv, 0);
1826 is the recommended Unicode-aware way of saying
1831 #define TRIE_STORE_REVCHAR(val) \
1834 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1835 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1836 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1837 SvCUR_set(zlopp, kapow - flrbbbbb); \
1840 av_push(revcharmap, zlopp); \
1842 char ooooff = (char)val; \
1843 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1847 /* This gets the next character from the input, folding it if not already
1849 #define TRIE_READ_CHAR STMT_START { \
1852 /* if it is UTF then it is either already folded, or does not need \
1854 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
1856 else if (folder == PL_fold_latin1) { \
1857 /* This folder implies Unicode rules, which in the range expressible \
1858 * by not UTF is the lower case, with the two exceptions, one of \
1859 * which should have been taken care of before calling this */ \
1860 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1861 uvc = toLOWER_L1(*uc); \
1862 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1865 /* raw data, will be folded later if needed */ \
1873 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1874 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1875 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1876 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1878 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1879 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1880 TRIE_LIST_CUR( state )++; \
1883 #define TRIE_LIST_NEW(state) STMT_START { \
1884 Newxz( trie->states[ state ].trans.list, \
1885 4, reg_trie_trans_le ); \
1886 TRIE_LIST_CUR( state ) = 1; \
1887 TRIE_LIST_LEN( state ) = 4; \
1890 #define TRIE_HANDLE_WORD(state) STMT_START { \
1891 U16 dupe= trie->states[ state ].wordnum; \
1892 regnode * const noper_next = regnext( noper ); \
1895 /* store the word for dumping */ \
1897 if (OP(noper) != NOTHING) \
1898 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1900 tmp = newSVpvn_utf8( "", 0, UTF ); \
1901 av_push( trie_words, tmp ); \
1905 trie->wordinfo[curword].prev = 0; \
1906 trie->wordinfo[curword].len = wordlen; \
1907 trie->wordinfo[curword].accept = state; \
1909 if ( noper_next < tail ) { \
1911 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1913 trie->jump[curword] = (U16)(noper_next - convert); \
1915 jumper = noper_next; \
1917 nextbranch= regnext(cur); \
1921 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1922 /* chain, so that when the bits of chain are later */\
1923 /* linked together, the dups appear in the chain */\
1924 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1925 trie->wordinfo[dupe].prev = curword; \
1927 /* we haven't inserted this word yet. */ \
1928 trie->states[ state ].wordnum = curword; \
1933 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1934 ( ( base + charid >= ucharcount \
1935 && base + charid < ubound \
1936 && state == trie->trans[ base - ucharcount + charid ].check \
1937 && trie->trans[ base - ucharcount + charid ].next ) \
1938 ? trie->trans[ base - ucharcount + charid ].next \
1939 : ( state==1 ? special : 0 ) \
1943 #define MADE_JUMP_TRIE 2
1944 #define MADE_EXACT_TRIE 4
1947 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1948 regnode *first, regnode *last, regnode *tail,
1949 U32 word_count, U32 flags, U32 depth)
1951 /* first pass, loop through and scan words */
1952 reg_trie_data *trie;
1953 HV *widecharmap = NULL;
1954 AV *revcharmap = newAV();
1960 regnode *jumper = NULL;
1961 regnode *nextbranch = NULL;
1962 regnode *convert = NULL;
1963 U32 *prev_states; /* temp array mapping each state to previous one */
1964 /* we just use folder as a flag in utf8 */
1965 const U8 * folder = NULL;
1968 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1969 AV *trie_words = NULL;
1970 /* along with revcharmap, this only used during construction but both are
1971 * useful during debugging so we store them in the struct when debugging.
1974 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1975 STRLEN trie_charcount=0;
1977 SV *re_trie_maxbuff;
1978 GET_RE_DEBUG_FLAGS_DECL;
1980 PERL_ARGS_ASSERT_MAKE_TRIE;
1982 PERL_UNUSED_ARG(depth);
1989 case EXACTFU: folder = PL_fold_latin1; break;
1990 case EXACTF: folder = PL_fold; break;
1991 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1994 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1996 trie->startstate = 1;
1997 trie->wordcount = word_count;
1998 RExC_rxi->data->data[ data_slot ] = (void*)trie;
1999 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2001 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2002 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2003 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2006 trie_words = newAV();
2009 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2010 assert(re_trie_maxbuff);
2011 if (!SvIOK(re_trie_maxbuff)) {
2012 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2014 DEBUG_TRIE_COMPILE_r({
2015 PerlIO_printf( Perl_debug_log,
2016 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2017 (int)depth * 2 + 2, "",
2018 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2019 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2022 /* Find the node we are going to overwrite */
2023 if ( first == startbranch && OP( last ) != BRANCH ) {
2024 /* whole branch chain */
2027 /* branch sub-chain */
2028 convert = NEXTOPER( first );
2031 /* -- First loop and Setup --
2033 We first traverse the branches and scan each word to determine if it
2034 contains widechars, and how many unique chars there are, this is
2035 important as we have to build a table with at least as many columns as we
2038 We use an array of integers to represent the character codes 0..255
2039 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2040 the native representation of the character value as the key and IV's for
2043 *TODO* If we keep track of how many times each character is used we can
2044 remap the columns so that the table compression later on is more
2045 efficient in terms of memory by ensuring the most common value is in the
2046 middle and the least common are on the outside. IMO this would be better
2047 than a most to least common mapping as theres a decent chance the most
2048 common letter will share a node with the least common, meaning the node
2049 will not be compressible. With a middle is most common approach the worst
2050 case is when we have the least common nodes twice.
2054 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2055 regnode *noper = NEXTOPER( cur );
2056 const U8 *uc = (U8*)STRING( noper );
2057 const U8 *e = uc + STR_LEN( noper );
2059 U32 wordlen = 0; /* required init */
2060 STRLEN minchars = 0;
2061 STRLEN maxchars = 0;
2062 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2065 if (OP(noper) == NOTHING) {
2066 regnode *noper_next= regnext(noper);
2067 if (noper_next != tail && OP(noper_next) == flags) {
2069 uc= (U8*)STRING(noper);
2070 e= uc + STR_LEN(noper);
2071 trie->minlen= STR_LEN(noper);
2078 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2079 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2080 regardless of encoding */
2081 if (OP( noper ) == EXACTFU_SS) {
2082 /* false positives are ok, so just set this */
2083 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2086 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2088 TRIE_CHARCOUNT(trie)++;
2091 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2092 * is in effect. Under /i, this character can match itself, or
2093 * anything that folds to it. If not under /i, it can match just
2094 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2095 * all fold to k, and all are single characters. But some folds
2096 * expand to more than one character, so for example LATIN SMALL
2097 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2098 * the string beginning at 'uc' is 'ffi', it could be matched by
2099 * three characters, or just by the one ligature character. (It
2100 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2101 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2102 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2103 * match.) The trie needs to know the minimum and maximum number
2104 * of characters that could match so that it can use size alone to
2105 * quickly reject many match attempts. The max is simple: it is
2106 * the number of folded characters in this branch (since a fold is
2107 * never shorter than what folds to it. */
2111 /* And the min is equal to the max if not under /i (indicated by
2112 * 'folder' being NULL), or there are no multi-character folds. If
2113 * there is a multi-character fold, the min is incremented just
2114 * once, for the character that folds to the sequence. Each
2115 * character in the sequence needs to be added to the list below of
2116 * characters in the trie, but we count only the first towards the
2117 * min number of characters needed. This is done through the
2118 * variable 'foldlen', which is returned by the macros that look
2119 * for these sequences as the number of bytes the sequence
2120 * occupies. Each time through the loop, we decrement 'foldlen' by
2121 * how many bytes the current char occupies. Only when it reaches
2122 * 0 do we increment 'minchars' or look for another multi-character
2124 if (folder == NULL) {
2127 else if (foldlen > 0) {
2128 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2133 /* See if *uc is the beginning of a multi-character fold. If
2134 * so, we decrement the length remaining to look at, to account
2135 * for the current character this iteration. (We can use 'uc'
2136 * instead of the fold returned by TRIE_READ_CHAR because for
2137 * non-UTF, the latin1_safe macro is smart enough to account
2138 * for all the unfolded characters, and because for UTF, the
2139 * string will already have been folded earlier in the
2140 * compilation process */
2142 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2143 foldlen -= UTF8SKIP(uc);
2146 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2151 /* The current character (and any potential folds) should be added
2152 * to the possible matching characters for this position in this
2156 U8 folded= folder[ (U8) uvc ];
2157 if ( !trie->charmap[ folded ] ) {
2158 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2159 TRIE_STORE_REVCHAR( folded );
2162 if ( !trie->charmap[ uvc ] ) {
2163 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2164 TRIE_STORE_REVCHAR( uvc );
2167 /* store the codepoint in the bitmap, and its folded
2169 TRIE_BITMAP_SET(trie, uvc);
2171 /* store the folded codepoint */
2172 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2175 /* store first byte of utf8 representation of
2176 variant codepoints */
2177 if (! UVCHR_IS_INVARIANT(uvc)) {
2178 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2181 set_bit = 0; /* We've done our bit :-) */
2185 /* XXX We could come up with the list of code points that fold
2186 * to this using PL_utf8_foldclosures, except not for
2187 * multi-char folds, as there may be multiple combinations
2188 * there that could work, which needs to wait until runtime to
2189 * resolve (The comment about LIGATURE FFI above is such an
2194 widecharmap = newHV();
2196 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2199 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2201 if ( !SvTRUE( *svpp ) ) {
2202 sv_setiv( *svpp, ++trie->uniquecharcount );
2203 TRIE_STORE_REVCHAR(uvc);
2206 } /* end loop through characters in this branch of the trie */
2208 /* We take the min and max for this branch and combine to find the min
2209 * and max for all branches processed so far */
2210 if( cur == first ) {
2211 trie->minlen = minchars;
2212 trie->maxlen = maxchars;
2213 } else if (minchars < trie->minlen) {
2214 trie->minlen = minchars;
2215 } else if (maxchars > trie->maxlen) {
2216 trie->maxlen = maxchars;
2218 } /* end first pass */
2219 DEBUG_TRIE_COMPILE_r(
2220 PerlIO_printf( Perl_debug_log,
2221 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2222 (int)depth * 2 + 2,"",
2223 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2224 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2225 (int)trie->minlen, (int)trie->maxlen )
2229 We now know what we are dealing with in terms of unique chars and
2230 string sizes so we can calculate how much memory a naive
2231 representation using a flat table will take. If it's over a reasonable
2232 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2233 conservative but potentially much slower representation using an array
2236 At the end we convert both representations into the same compressed
2237 form that will be used in regexec.c for matching with. The latter
2238 is a form that cannot be used to construct with but has memory
2239 properties similar to the list form and access properties similar
2240 to the table form making it both suitable for fast searches and
2241 small enough that its feasable to store for the duration of a program.
2243 See the comment in the code where the compressed table is produced
2244 inplace from the flat tabe representation for an explanation of how
2245 the compression works.
2250 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2253 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2254 > SvIV(re_trie_maxbuff) )
2257 Second Pass -- Array Of Lists Representation
2259 Each state will be represented by a list of charid:state records
2260 (reg_trie_trans_le) the first such element holds the CUR and LEN
2261 points of the allocated array. (See defines above).
2263 We build the initial structure using the lists, and then convert
2264 it into the compressed table form which allows faster lookups
2265 (but cant be modified once converted).
2268 STRLEN transcount = 1;
2270 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2271 "%*sCompiling trie using list compiler\n",
2272 (int)depth * 2 + 2, ""));
2274 trie->states = (reg_trie_state *)
2275 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2276 sizeof(reg_trie_state) );
2280 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2282 regnode *noper = NEXTOPER( cur );
2283 U8 *uc = (U8*)STRING( noper );
2284 const U8 *e = uc + STR_LEN( noper );
2285 U32 state = 1; /* required init */
2286 U16 charid = 0; /* sanity init */
2287 U32 wordlen = 0; /* required init */
2289 if (OP(noper) == NOTHING) {
2290 regnode *noper_next= regnext(noper);
2291 if (noper_next != tail && OP(noper_next) == flags) {
2293 uc= (U8*)STRING(noper);
2294 e= uc + STR_LEN(noper);
2298 if (OP(noper) != NOTHING) {
2299 for ( ; uc < e ; uc += len ) {
2304 charid = trie->charmap[ uvc ];
2306 SV** const svpp = hv_fetch( widecharmap,
2313 charid=(U16)SvIV( *svpp );
2316 /* charid is now 0 if we dont know the char read, or
2317 * nonzero if we do */
2324 if ( !trie->states[ state ].trans.list ) {
2325 TRIE_LIST_NEW( state );
2328 check <= TRIE_LIST_USED( state );
2331 if ( TRIE_LIST_ITEM( state, check ).forid
2334 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2339 newstate = next_alloc++;
2340 prev_states[newstate] = state;
2341 TRIE_LIST_PUSH( state, charid, newstate );
2346 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2350 TRIE_HANDLE_WORD(state);
2352 } /* end second pass */
2354 /* next alloc is the NEXT state to be allocated */
2355 trie->statecount = next_alloc;
2356 trie->states = (reg_trie_state *)
2357 PerlMemShared_realloc( trie->states,
2359 * sizeof(reg_trie_state) );
2361 /* and now dump it out before we compress it */
2362 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2363 revcharmap, next_alloc,
2367 trie->trans = (reg_trie_trans *)
2368 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2375 for( state=1 ; state < next_alloc ; state ++ ) {
2379 DEBUG_TRIE_COMPILE_MORE_r(
2380 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2384 if (trie->states[state].trans.list) {
2385 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2389 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2390 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2391 if ( forid < minid ) {
2393 } else if ( forid > maxid ) {
2397 if ( transcount < tp + maxid - minid + 1) {
2399 trie->trans = (reg_trie_trans *)
2400 PerlMemShared_realloc( trie->trans,
2402 * sizeof(reg_trie_trans) );
2403 Zero( trie->trans + (transcount / 2),
2407 base = trie->uniquecharcount + tp - minid;
2408 if ( maxid == minid ) {
2410 for ( ; zp < tp ; zp++ ) {
2411 if ( ! trie->trans[ zp ].next ) {
2412 base = trie->uniquecharcount + zp - minid;
2413 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2415 trie->trans[ zp ].check = state;
2421 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2423 trie->trans[ tp ].check = state;
2428 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2429 const U32 tid = base
2430 - trie->uniquecharcount
2431 + TRIE_LIST_ITEM( state, idx ).forid;
2432 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2434 trie->trans[ tid ].check = state;
2436 tp += ( maxid - minid + 1 );
2438 Safefree(trie->states[ state ].trans.list);
2441 DEBUG_TRIE_COMPILE_MORE_r(
2442 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2445 trie->states[ state ].trans.base=base;
2447 trie->lasttrans = tp + 1;
2451 Second Pass -- Flat Table Representation.
2453 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2454 each. We know that we will need Charcount+1 trans at most to store
2455 the data (one row per char at worst case) So we preallocate both
2456 structures assuming worst case.
2458 We then construct the trie using only the .next slots of the entry
2461 We use the .check field of the first entry of the node temporarily
2462 to make compression both faster and easier by keeping track of how
2463 many non zero fields are in the node.
2465 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2468 There are two terms at use here: state as a TRIE_NODEIDX() which is
2469 a number representing the first entry of the node, and state as a
2470 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2471 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2472 if there are 2 entrys per node. eg:
2480 The table is internally in the right hand, idx form. However as we
2481 also have to deal with the states array which is indexed by nodenum
2482 we have to use TRIE_NODENUM() to convert.
2485 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2486 "%*sCompiling trie using table compiler\n",
2487 (int)depth * 2 + 2, ""));
2489 trie->trans = (reg_trie_trans *)
2490 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2491 * trie->uniquecharcount + 1,
2492 sizeof(reg_trie_trans) );
2493 trie->states = (reg_trie_state *)
2494 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2495 sizeof(reg_trie_state) );
2496 next_alloc = trie->uniquecharcount + 1;
2499 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2501 regnode *noper = NEXTOPER( cur );
2502 const U8 *uc = (U8*)STRING( noper );
2503 const U8 *e = uc + STR_LEN( noper );
2505 U32 state = 1; /* required init */
2507 U16 charid = 0; /* sanity init */
2508 U32 accept_state = 0; /* sanity init */
2510 U32 wordlen = 0; /* required init */
2512 if (OP(noper) == NOTHING) {
2513 regnode *noper_next= regnext(noper);
2514 if (noper_next != tail && OP(noper_next) == flags) {
2516 uc= (U8*)STRING(noper);
2517 e= uc + STR_LEN(noper);
2521 if ( OP(noper) != NOTHING ) {
2522 for ( ; uc < e ; uc += len ) {
2527 charid = trie->charmap[ uvc ];
2529 SV* const * const svpp = hv_fetch( widecharmap,
2533 charid = svpp ? (U16)SvIV(*svpp) : 0;
2537 if ( !trie->trans[ state + charid ].next ) {
2538 trie->trans[ state + charid ].next = next_alloc;
2539 trie->trans[ state ].check++;
2540 prev_states[TRIE_NODENUM(next_alloc)]
2541 = TRIE_NODENUM(state);
2542 next_alloc += trie->uniquecharcount;
2544 state = trie->trans[ state + charid ].next;
2546 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2548 /* charid is now 0 if we dont know the char read, or
2549 * nonzero if we do */
2552 accept_state = TRIE_NODENUM( state );
2553 TRIE_HANDLE_WORD(accept_state);
2555 } /* end second pass */
2557 /* and now dump it out before we compress it */
2558 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2560 next_alloc, depth+1));
2564 * Inplace compress the table.*
2566 For sparse data sets the table constructed by the trie algorithm will
2567 be mostly 0/FAIL transitions or to put it another way mostly empty.
2568 (Note that leaf nodes will not contain any transitions.)
2570 This algorithm compresses the tables by eliminating most such
2571 transitions, at the cost of a modest bit of extra work during lookup:
2573 - Each states[] entry contains a .base field which indicates the
2574 index in the state[] array wheres its transition data is stored.
2576 - If .base is 0 there are no valid transitions from that node.
2578 - If .base is nonzero then charid is added to it to find an entry in
2581 -If trans[states[state].base+charid].check!=state then the
2582 transition is taken to be a 0/Fail transition. Thus if there are fail
2583 transitions at the front of the node then the .base offset will point
2584 somewhere inside the previous nodes data (or maybe even into a node
2585 even earlier), but the .check field determines if the transition is
2589 The following process inplace converts the table to the compressed
2590 table: We first do not compress the root node 1,and mark all its
2591 .check pointers as 1 and set its .base pointer as 1 as well. This
2592 allows us to do a DFA construction from the compressed table later,
2593 and ensures that any .base pointers we calculate later are greater
2596 - We set 'pos' to indicate the first entry of the second node.
2598 - We then iterate over the columns of the node, finding the first and
2599 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2600 and set the .check pointers accordingly, and advance pos
2601 appropriately and repreat for the next node. Note that when we copy
2602 the next pointers we have to convert them from the original
2603 NODEIDX form to NODENUM form as the former is not valid post
2606 - If a node has no transitions used we mark its base as 0 and do not
2607 advance the pos pointer.
2609 - If a node only has one transition we use a second pointer into the
2610 structure to fill in allocated fail transitions from other states.
2611 This pointer is independent of the main pointer and scans forward
2612 looking for null transitions that are allocated to a state. When it
2613 finds one it writes the single transition into the "hole". If the
2614 pointer doesnt find one the single transition is appended as normal.
2616 - Once compressed we can Renew/realloc the structures to release the
2619 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2620 specifically Fig 3.47 and the associated pseudocode.
2624 const U32 laststate = TRIE_NODENUM( next_alloc );
2627 trie->statecount = laststate;
2629 for ( state = 1 ; state < laststate ; state++ ) {
2631 const U32 stateidx = TRIE_NODEIDX( state );
2632 const U32 o_used = trie->trans[ stateidx ].check;
2633 U32 used = trie->trans[ stateidx ].check;
2634 trie->trans[ stateidx ].check = 0;
2637 used && charid < trie->uniquecharcount;
2640 if ( flag || trie->trans[ stateidx + charid ].next ) {
2641 if ( trie->trans[ stateidx + charid ].next ) {
2643 for ( ; zp < pos ; zp++ ) {
2644 if ( ! trie->trans[ zp ].next ) {
2648 trie->states[ state ].trans.base
2650 + trie->uniquecharcount
2652 trie->trans[ zp ].next
2653 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2655 trie->trans[ zp ].check = state;
2656 if ( ++zp > pos ) pos = zp;
2663 trie->states[ state ].trans.base
2664 = pos + trie->uniquecharcount - charid ;
2666 trie->trans[ pos ].next
2667 = SAFE_TRIE_NODENUM(
2668 trie->trans[ stateidx + charid ].next );
2669 trie->trans[ pos ].check = state;
2674 trie->lasttrans = pos + 1;
2675 trie->states = (reg_trie_state *)
2676 PerlMemShared_realloc( trie->states, laststate
2677 * sizeof(reg_trie_state) );
2678 DEBUG_TRIE_COMPILE_MORE_r(
2679 PerlIO_printf( Perl_debug_log,
2680 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2681 (int)depth * 2 + 2,"",
2682 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2686 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2689 } /* end table compress */
2691 DEBUG_TRIE_COMPILE_MORE_r(
2692 PerlIO_printf(Perl_debug_log,
2693 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2694 (int)depth * 2 + 2, "",
2695 (UV)trie->statecount,
2696 (UV)trie->lasttrans)
2698 /* resize the trans array to remove unused space */
2699 trie->trans = (reg_trie_trans *)
2700 PerlMemShared_realloc( trie->trans, trie->lasttrans
2701 * sizeof(reg_trie_trans) );
2703 { /* Modify the program and insert the new TRIE node */
2704 U8 nodetype =(U8)(flags & 0xFF);
2708 regnode *optimize = NULL;
2709 #ifdef RE_TRACK_PATTERN_OFFSETS
2712 U32 mjd_nodelen = 0;
2713 #endif /* RE_TRACK_PATTERN_OFFSETS */
2714 #endif /* DEBUGGING */
2716 This means we convert either the first branch or the first Exact,
2717 depending on whether the thing following (in 'last') is a branch
2718 or not and whther first is the startbranch (ie is it a sub part of
2719 the alternation or is it the whole thing.)
2720 Assuming its a sub part we convert the EXACT otherwise we convert
2721 the whole branch sequence, including the first.
2723 /* Find the node we are going to overwrite */
2724 if ( first != startbranch || OP( last ) == BRANCH ) {
2725 /* branch sub-chain */
2726 NEXT_OFF( first ) = (U16)(last - first);
2727 #ifdef RE_TRACK_PATTERN_OFFSETS
2729 mjd_offset= Node_Offset((convert));
2730 mjd_nodelen= Node_Length((convert));
2733 /* whole branch chain */
2735 #ifdef RE_TRACK_PATTERN_OFFSETS
2738 const regnode *nop = NEXTOPER( convert );
2739 mjd_offset= Node_Offset((nop));
2740 mjd_nodelen= Node_Length((nop));
2744 PerlIO_printf(Perl_debug_log,
2745 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2746 (int)depth * 2 + 2, "",
2747 (UV)mjd_offset, (UV)mjd_nodelen)
2750 /* But first we check to see if there is a common prefix we can
2751 split out as an EXACT and put in front of the TRIE node. */
2752 trie->startstate= 1;
2753 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2755 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2759 const U32 base = trie->states[ state ].trans.base;
2761 if ( trie->states[state].wordnum )
2764 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2765 if ( ( base + ofs >= trie->uniquecharcount ) &&
2766 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2767 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2769 if ( ++count > 1 ) {
2770 SV **tmp = av_fetch( revcharmap, ofs, 0);
2771 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2772 if ( state == 1 ) break;
2774 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2776 PerlIO_printf(Perl_debug_log,
2777 "%*sNew Start State=%"UVuf" Class: [",
2778 (int)depth * 2 + 2, "",
2781 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2782 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2784 TRIE_BITMAP_SET(trie,*ch);
2786 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2788 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2792 TRIE_BITMAP_SET(trie,*ch);
2794 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2795 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2801 SV **tmp = av_fetch( revcharmap, idx, 0);
2803 char *ch = SvPV( *tmp, len );
2805 SV *sv=sv_newmortal();
2806 PerlIO_printf( Perl_debug_log,
2807 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2808 (int)depth * 2 + 2, "",
2810 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2811 PL_colors[0], PL_colors[1],
2812 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2813 PERL_PV_ESCAPE_FIRSTCHAR
2818 OP( convert ) = nodetype;
2819 str=STRING(convert);
2822 STR_LEN(convert) += len;
2828 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2833 trie->prefixlen = (state-1);
2835 regnode *n = convert+NODE_SZ_STR(convert);
2836 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2837 trie->startstate = state;
2838 trie->minlen -= (state - 1);
2839 trie->maxlen -= (state - 1);
2841 /* At least the UNICOS C compiler choked on this
2842 * being argument to DEBUG_r(), so let's just have
2845 #ifdef PERL_EXT_RE_BUILD
2851 regnode *fix = convert;
2852 U32 word = trie->wordcount;
2854 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2855 while( ++fix < n ) {
2856 Set_Node_Offset_Length(fix, 0, 0);
2859 SV ** const tmp = av_fetch( trie_words, word, 0 );
2861 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2862 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2864 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2872 NEXT_OFF(convert) = (U16)(tail - convert);
2873 DEBUG_r(optimize= n);
2879 if ( trie->maxlen ) {
2880 NEXT_OFF( convert ) = (U16)(tail - convert);
2881 ARG_SET( convert, data_slot );
2882 /* Store the offset to the first unabsorbed branch in
2883 jump[0], which is otherwise unused by the jump logic.
2884 We use this when dumping a trie and during optimisation. */
2886 trie->jump[0] = (U16)(nextbranch - convert);
2888 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2889 * and there is a bitmap
2890 * and the first "jump target" node we found leaves enough room
2891 * then convert the TRIE node into a TRIEC node, with the bitmap
2892 * embedded inline in the opcode - this is hypothetically faster.
2894 if ( !trie->states[trie->startstate].wordnum
2896 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2898 OP( convert ) = TRIEC;
2899 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2900 PerlMemShared_free(trie->bitmap);
2903 OP( convert ) = TRIE;
2905 /* store the type in the flags */
2906 convert->flags = nodetype;
2910 + regarglen[ OP( convert ) ];
2912 /* XXX We really should free up the resource in trie now,
2913 as we won't use them - (which resources?) dmq */
2915 /* needed for dumping*/
2916 DEBUG_r(if (optimize) {
2917 regnode *opt = convert;
2919 while ( ++opt < optimize) {
2920 Set_Node_Offset_Length(opt,0,0);
2923 Try to clean up some of the debris left after the
2926 while( optimize < jumper ) {
2927 mjd_nodelen += Node_Length((optimize));
2928 OP( optimize ) = OPTIMIZED;
2929 Set_Node_Offset_Length(optimize,0,0);
2932 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2934 } /* end node insert */
2936 /* Finish populating the prev field of the wordinfo array. Walk back
2937 * from each accept state until we find another accept state, and if
2938 * so, point the first word's .prev field at the second word. If the
2939 * second already has a .prev field set, stop now. This will be the
2940 * case either if we've already processed that word's accept state,
2941 * or that state had multiple words, and the overspill words were
2942 * already linked up earlier.
2949 for (word=1; word <= trie->wordcount; word++) {
2951 if (trie->wordinfo[word].prev)
2953 state = trie->wordinfo[word].accept;
2955 state = prev_states[state];
2958 prev = trie->states[state].wordnum;
2962 trie->wordinfo[word].prev = prev;
2964 Safefree(prev_states);
2968 /* and now dump out the compressed format */
2969 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2971 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2973 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2974 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2976 SvREFCNT_dec_NN(revcharmap);
2980 : trie->startstate>1
2986 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
2988 /* The Trie is constructed and compressed now so we can build a fail array if
2991 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2993 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2997 We find the fail state for each state in the trie, this state is the longest
2998 proper suffix of the current state's 'word' that is also a proper prefix of
2999 another word in our trie. State 1 represents the word '' and is thus the
3000 default fail state. This allows the DFA not to have to restart after its
3001 tried and failed a word at a given point, it simply continues as though it
3002 had been matching the other word in the first place.
3004 'abcdgu'=~/abcdefg|cdgu/
3005 When we get to 'd' we are still matching the first word, we would encounter
3006 'g' which would fail, which would bring us to the state representing 'd' in
3007 the second word where we would try 'g' and succeed, proceeding to match
3010 /* add a fail transition */
3011 const U32 trie_offset = ARG(source);
3012 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3014 const U32 ucharcount = trie->uniquecharcount;
3015 const U32 numstates = trie->statecount;
3016 const U32 ubound = trie->lasttrans + ucharcount;
3020 U32 base = trie->states[ 1 ].trans.base;
3023 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3025 GET_RE_DEBUG_FLAGS_DECL;
3027 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3028 PERL_UNUSED_CONTEXT;
3030 PERL_UNUSED_ARG(depth);
3033 if ( OP(source) == TRIE ) {
3034 struct regnode_1 *op = (struct regnode_1 *)
3035 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3036 StructCopy(source,op,struct regnode_1);
3037 stclass = (regnode *)op;
3039 struct regnode_charclass *op = (struct regnode_charclass *)
3040 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3041 StructCopy(source,op,struct regnode_charclass);
3042 stclass = (regnode *)op;
3044 OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3046 ARG_SET( stclass, data_slot );
3047 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3048 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3049 aho->trie=trie_offset;
3050 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3051 Copy( trie->states, aho->states, numstates, reg_trie_state );
3052 Newxz( q, numstates, U32);
3053 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3056 /* initialize fail[0..1] to be 1 so that we always have
3057 a valid final fail state */
3058 fail[ 0 ] = fail[ 1 ] = 1;
3060 for ( charid = 0; charid < ucharcount ; charid++ ) {
3061 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3063 q[ q_write ] = newstate;
3064 /* set to point at the root */
3065 fail[ q[ q_write++ ] ]=1;
3068 while ( q_read < q_write) {
3069 const U32 cur = q[ q_read++ % numstates ];
3070 base = trie->states[ cur ].trans.base;
3072 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3073 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3075 U32 fail_state = cur;
3078 fail_state = fail[ fail_state ];
3079 fail_base = aho->states[ fail_state ].trans.base;
3080 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3082 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3083 fail[ ch_state ] = fail_state;
3084 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3086 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3088 q[ q_write++ % numstates] = ch_state;
3092 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3093 when we fail in state 1, this allows us to use the
3094 charclass scan to find a valid start char. This is based on the principle
3095 that theres a good chance the string being searched contains lots of stuff
3096 that cant be a start char.
3098 fail[ 0 ] = fail[ 1 ] = 0;
3099 DEBUG_TRIE_COMPILE_r({
3100 PerlIO_printf(Perl_debug_log,
3101 "%*sStclass Failtable (%"UVuf" states): 0",
3102 (int)(depth * 2), "", (UV)numstates
3104 for( q_read=1; q_read<numstates; q_read++ ) {
3105 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3107 PerlIO_printf(Perl_debug_log, "\n");
3110 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3115 #define DEBUG_PEEP(str,scan,depth) \
3116 DEBUG_OPTIMISE_r({if (scan){ \
3117 SV * const mysv=sv_newmortal(); \
3118 regnode *Next = regnext(scan); \
3119 regprop(RExC_rx, mysv, scan, NULL); \
3120 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3121 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3122 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3126 /* The below joins as many adjacent EXACTish nodes as possible into a single
3127 * one. The regop may be changed if the node(s) contain certain sequences that
3128 * require special handling. The joining is only done if:
3129 * 1) there is room in the current conglomerated node to entirely contain the
3131 * 2) they are the exact same node type
3133 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3134 * these get optimized out
3136 * If a node is to match under /i (folded), the number of characters it matches
3137 * can be different than its character length if it contains a multi-character
3138 * fold. *min_subtract is set to the total delta number of characters of the
3141 * And *unfolded_multi_char is set to indicate whether or not the node contains
3142 * an unfolded multi-char fold. This happens when whether the fold is valid or
3143 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3144 * SMALL LETTER SHARP S, as only if the target string being matched against
3145 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3146 * folding rules depend on the locale in force at runtime. (Multi-char folds
3147 * whose components are all above the Latin1 range are not run-time locale
3148 * dependent, and have already been folded by the time this function is
3151 * This is as good a place as any to discuss the design of handling these
3152 * multi-character fold sequences. It's been wrong in Perl for a very long
3153 * time. There are three code points in Unicode whose multi-character folds
3154 * were long ago discovered to mess things up. The previous designs for
3155 * dealing with these involved assigning a special node for them. This
3156 * approach doesn't always work, as evidenced by this example:
3157 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3158 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3159 * would match just the \xDF, it won't be able to handle the case where a
3160 * successful match would have to cross the node's boundary. The new approach
3161 * that hopefully generally solves the problem generates an EXACTFU_SS node
3162 * that is "sss" in this case.
3164 * It turns out that there are problems with all multi-character folds, and not
3165 * just these three. Now the code is general, for all such cases. The
3166 * approach taken is:
3167 * 1) This routine examines each EXACTFish node that could contain multi-
3168 * character folded sequences. Since a single character can fold into
3169 * such a sequence, the minimum match length for this node is less than
3170 * the number of characters in the node. This routine returns in
3171 * *min_subtract how many characters to subtract from the the actual
3172 * length of the string to get a real minimum match length; it is 0 if
3173 * there are no multi-char foldeds. This delta is used by the caller to
3174 * adjust the min length of the match, and the delta between min and max,
3175 * so that the optimizer doesn't reject these possibilities based on size
3177 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3178 * is used for an EXACTFU node that contains at least one "ss" sequence in
3179 * it. For non-UTF-8 patterns and strings, this is the only case where
3180 * there is a possible fold length change. That means that a regular
3181 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3182 * with length changes, and so can be processed faster. regexec.c takes
3183 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3184 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3185 * known until runtime). This saves effort in regex matching. However,
3186 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3187 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3188 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3189 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3190 * possibilities for the non-UTF8 patterns are quite simple, except for
3191 * the sharp s. All the ones that don't involve a UTF-8 target string are
3192 * members of a fold-pair, and arrays are set up for all of them so that
3193 * the other member of the pair can be found quickly. Code elsewhere in
3194 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3195 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3196 * described in the next item.
3197 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3198 * validity of the fold won't be known until runtime, and so must remain
3199 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3200 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3201 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3202 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3203 * The reason this is a problem is that the optimizer part of regexec.c
3204 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3205 * that a character in the pattern corresponds to at most a single
3206 * character in the target string. (And I do mean character, and not byte
3207 * here, unlike other parts of the documentation that have never been
3208 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3209 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3210 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3211 * nodes, violate the assumption, and they are the only instances where it
3212 * is violated. I'm reluctant to try to change the assumption, as the
3213 * code involved is impenetrable to me (khw), so instead the code here
3214 * punts. This routine examines EXACTFL nodes, and (when the pattern
3215 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3216 * boolean indicating whether or not the node contains such a fold. When
3217 * it is true, the caller sets a flag that later causes the optimizer in
3218 * this file to not set values for the floating and fixed string lengths,
3219 * and thus avoids the optimizer code in regexec.c that makes the invalid
3220 * assumption. Thus, there is no optimization based on string lengths for
3221 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3222 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3223 * assumption is wrong only in these cases is that all other non-UTF-8
3224 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3225 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3226 * EXACTF nodes because we don't know at compile time if it actually
3227 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3228 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3229 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3230 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3231 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3232 * string would require the pattern to be forced into UTF-8, the overhead
3233 * of which we want to avoid. Similarly the unfolded multi-char folds in
3234 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3237 * Similarly, the code that generates tries doesn't currently handle
3238 * not-already-folded multi-char folds, and it looks like a pain to change
3239 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3240 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3241 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3242 * using /iaa matching will be doing so almost entirely with ASCII
3243 * strings, so this should rarely be encountered in practice */
3245 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3246 if (PL_regkind[OP(scan)] == EXACT) \
3247 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3250 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3251 UV *min_subtract, bool *unfolded_multi_char,
3252 U32 flags,regnode *val, U32 depth)
3254 /* Merge several consecutive EXACTish nodes into one. */
3255 regnode *n = regnext(scan);
3257 regnode *next = scan + NODE_SZ_STR(scan);
3261 regnode *stop = scan;
3262 GET_RE_DEBUG_FLAGS_DECL;
3264 PERL_UNUSED_ARG(depth);
3267 PERL_ARGS_ASSERT_JOIN_EXACT;
3268 #ifndef EXPERIMENTAL_INPLACESCAN
3269 PERL_UNUSED_ARG(flags);
3270 PERL_UNUSED_ARG(val);
3272 DEBUG_PEEP("join",scan,depth);
3274 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3275 * EXACT ones that are mergeable to the current one. */
3277 && (PL_regkind[OP(n)] == NOTHING
3278 || (stringok && OP(n) == OP(scan)))
3280 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3283 if (OP(n) == TAIL || n > next)
3285 if (PL_regkind[OP(n)] == NOTHING) {
3286 DEBUG_PEEP("skip:",n,depth);
3287 NEXT_OFF(scan) += NEXT_OFF(n);
3288 next = n + NODE_STEP_REGNODE;
3295 else if (stringok) {
3296 const unsigned int oldl = STR_LEN(scan);
3297 regnode * const nnext = regnext(n);
3299 /* XXX I (khw) kind of doubt that this works on platforms (should
3300 * Perl ever run on one) where U8_MAX is above 255 because of lots
3301 * of other assumptions */
3302 /* Don't join if the sum can't fit into a single node */
3303 if (oldl + STR_LEN(n) > U8_MAX)
3306 DEBUG_PEEP("merg",n,depth);
3309 NEXT_OFF(scan) += NEXT_OFF(n);
3310 STR_LEN(scan) += STR_LEN(n);
3311 next = n + NODE_SZ_STR(n);
3312 /* Now we can overwrite *n : */
3313 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3321 #ifdef EXPERIMENTAL_INPLACESCAN
3322 if (flags && !NEXT_OFF(n)) {
3323 DEBUG_PEEP("atch", val, depth);
3324 if (reg_off_by_arg[OP(n)]) {
3325 ARG_SET(n, val - n);
3328 NEXT_OFF(n) = val - n;
3336 *unfolded_multi_char = FALSE;
3338 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3339 * can now analyze for sequences of problematic code points. (Prior to
3340 * this final joining, sequences could have been split over boundaries, and
3341 * hence missed). The sequences only happen in folding, hence for any
3342 * non-EXACT EXACTish node */
3343 if (OP(scan) != EXACT) {
3344 U8* s0 = (U8*) STRING(scan);
3346 U8* s_end = s0 + STR_LEN(scan);
3348 int total_count_delta = 0; /* Total delta number of characters that
3349 multi-char folds expand to */
3351 /* One pass is made over the node's string looking for all the
3352 * possibilities. To avoid some tests in the loop, there are two main
3353 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3358 if (OP(scan) == EXACTFL) {
3361 /* An EXACTFL node would already have been changed to another
3362 * node type unless there is at least one character in it that
3363 * is problematic; likely a character whose fold definition
3364 * won't be known until runtime, and so has yet to be folded.
3365 * For all but the UTF-8 locale, folds are 1-1 in length, but
3366 * to handle the UTF-8 case, we need to create a temporary
3367 * folded copy using UTF-8 locale rules in order to analyze it.
3368 * This is because our macros that look to see if a sequence is
3369 * a multi-char fold assume everything is folded (otherwise the
3370 * tests in those macros would be too complicated and slow).
3371 * Note that here, the non-problematic folds will have already
3372 * been done, so we can just copy such characters. We actually
3373 * don't completely fold the EXACTFL string. We skip the
3374 * unfolded multi-char folds, as that would just create work
3375 * below to figure out the size they already are */
3377 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3380 STRLEN s_len = UTF8SKIP(s);
3381 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3382 Copy(s, d, s_len, U8);
3385 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3386 *unfolded_multi_char = TRUE;
3387 Copy(s, d, s_len, U8);
3390 else if (isASCII(*s)) {
3391 *(d++) = toFOLD(*s);
3395 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3401 /* Point the remainder of the routine to look at our temporary
3405 } /* End of creating folded copy of EXACTFL string */
3407 /* Examine the string for a multi-character fold sequence. UTF-8
3408 * patterns have all characters pre-folded by the time this code is
3410 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3411 length sequence we are looking for is 2 */
3413 int count = 0; /* How many characters in a multi-char fold */
3414 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3415 if (! len) { /* Not a multi-char fold: get next char */
3420 /* Nodes with 'ss' require special handling, except for
3421 * EXACTFA-ish for which there is no multi-char fold to this */
3422 if (len == 2 && *s == 's' && *(s+1) == 's'
3423 && OP(scan) != EXACTFA
3424 && OP(scan) != EXACTFA_NO_TRIE)
3427 if (OP(scan) != EXACTFL) {
3428 OP(scan) = EXACTFU_SS;
3432 else { /* Here is a generic multi-char fold. */
3433 U8* multi_end = s + len;
3435 /* Count how many characters are in it. In the case of
3436 * /aa, no folds which contain ASCII code points are
3437 * allowed, so check for those, and skip if found. */
3438 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3439 count = utf8_length(s, multi_end);
3443 while (s < multi_end) {
3446 goto next_iteration;
3456 /* The delta is how long the sequence is minus 1 (1 is how long
3457 * the character that folds to the sequence is) */
3458 total_count_delta += count - 1;
3462 /* We created a temporary folded copy of the string in EXACTFL
3463 * nodes. Therefore we need to be sure it doesn't go below zero,
3464 * as the real string could be shorter */
3465 if (OP(scan) == EXACTFL) {
3466 int total_chars = utf8_length((U8*) STRING(scan),
3467 (U8*) STRING(scan) + STR_LEN(scan));
3468 if (total_count_delta > total_chars) {
3469 total_count_delta = total_chars;
3473 *min_subtract += total_count_delta;
3476 else if (OP(scan) == EXACTFA) {
3478 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3479 * fold to the ASCII range (and there are no existing ones in the
3480 * upper latin1 range). But, as outlined in the comments preceding
3481 * this function, we need to flag any occurrences of the sharp s.
3482 * This character forbids trie formation (because of added
3485 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3486 OP(scan) = EXACTFA_NO_TRIE;
3487 *unfolded_multi_char = TRUE;
3496 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3497 * folds that are all Latin1. As explained in the comments
3498 * preceding this function, we look also for the sharp s in EXACTF
3499 * and EXACTFL nodes; it can be in the final position. Otherwise
3500 * we can stop looking 1 byte earlier because have to find at least
3501 * two characters for a multi-fold */
3502 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3507 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3508 if (! len) { /* Not a multi-char fold. */
3509 if (*s == LATIN_SMALL_LETTER_SHARP_S
3510 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3512 *unfolded_multi_char = TRUE;
3519 && isARG2_lower_or_UPPER_ARG1('s', *s)
3520 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3523 /* EXACTF nodes need to know that the minimum length
3524 * changed so that a sharp s in the string can match this
3525 * ss in the pattern, but they remain EXACTF nodes, as they
3526 * won't match this unless the target string is is UTF-8,
3527 * which we don't know until runtime. EXACTFL nodes can't
3528 * transform into EXACTFU nodes */
3529 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3530 OP(scan) = EXACTFU_SS;
3534 *min_subtract += len - 1;
3541 /* Allow dumping but overwriting the collection of skipped
3542 * ops and/or strings with fake optimized ops */
3543 n = scan + NODE_SZ_STR(scan);
3551 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3555 /* REx optimizer. Converts nodes into quicker variants "in place".
3556 Finds fixed substrings. */
3558 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3559 to the position after last scanned or to NULL. */
3561 #define INIT_AND_WITHP \
3562 assert(!and_withp); \
3563 Newx(and_withp,1, regnode_ssc); \
3564 SAVEFREEPV(and_withp)
3566 /* this is a chain of data about sub patterns we are processing that
3567 need to be handled separately/specially in study_chunk. Its so
3568 we can simulate recursion without losing state. */
3570 typedef struct scan_frame {
3571 regnode *last; /* last node to process in this frame */
3572 regnode *next; /* next node to process when last is reached */
3573 struct scan_frame *prev; /*previous frame*/
3574 U32 prev_recursed_depth;
3575 I32 stop; /* what stopparen do we use */
3580 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3581 SSize_t *minlenp, SSize_t *deltap,
3586 regnode_ssc *and_withp,
3587 U32 flags, U32 depth)
3588 /* scanp: Start here (read-write). */
3589 /* deltap: Write maxlen-minlen here. */
3590 /* last: Stop before this one. */
3591 /* data: string data about the pattern */
3592 /* stopparen: treat close N as END */
3593 /* recursed: which subroutines have we recursed into */
3594 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3596 /* There must be at least this number of characters to match */
3599 regnode *scan = *scanp, *next;
3601 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3602 int is_inf_internal = 0; /* The studied chunk is infinite */
3603 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3604 scan_data_t data_fake;
3605 SV *re_trie_maxbuff = NULL;
3606 regnode *first_non_open = scan;
3607 SSize_t stopmin = SSize_t_MAX;
3608 scan_frame *frame = NULL;
3609 GET_RE_DEBUG_FLAGS_DECL;
3611 PERL_ARGS_ASSERT_STUDY_CHUNK;
3614 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3617 while (first_non_open && OP(first_non_open) == OPEN)
3618 first_non_open=regnext(first_non_open);
3623 while ( scan && OP(scan) != END && scan < last ){
3624 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3625 node length to get a real minimum (because
3626 the folded version may be shorter) */
3627 bool unfolded_multi_char = FALSE;
3628 /* Peephole optimizer: */
3629 DEBUG_OPTIMISE_MORE_r(
3631 PerlIO_printf(Perl_debug_log,
3632 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3633 ((int) depth*2), "", (long)stopparen,
3634 (unsigned long)depth, (unsigned long)recursed_depth);
3635 if (recursed_depth) {
3638 for ( j = 0 ; j < recursed_depth ; j++ ) {
3639 PerlIO_printf(Perl_debug_log,"[");
3640 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3641 PerlIO_printf(Perl_debug_log,"%d",
3642 PAREN_TEST(RExC_study_chunk_recursed +
3643 (j * RExC_study_chunk_recursed_bytes), i)
3646 PerlIO_printf(Perl_debug_log,"]");
3649 PerlIO_printf(Perl_debug_log,"\n");
3652 DEBUG_STUDYDATA("Peep:", data, depth);
3653 DEBUG_PEEP("Peep", scan, depth);
3656 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3657 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3658 * by a different invocation of reg() -- Yves
3660 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3662 /* Follow the next-chain of the current node and optimize
3663 away all the NOTHINGs from it. */
3664 if (OP(scan) != CURLYX) {
3665 const int max = (reg_off_by_arg[OP(scan)]
3667 /* I32 may be smaller than U16 on CRAYs! */
3668 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3669 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3673 /* Skip NOTHING and LONGJMP. */
3674 while ((n = regnext(n))
3675 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3676 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3677 && off + noff < max)
3679 if (reg_off_by_arg[OP(scan)])
3682 NEXT_OFF(scan) = off;
3687 /* The principal pseudo-switch. Cannot be a switch, since we
3688 look into several different things. */
3689 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3690 || OP(scan) == IFTHEN) {
3691 next = regnext(scan);
3693 /* demq: the op(next)==code check is to see if we have
3694 * "branch-branch" AFAICT */
3696 if (OP(next) == code || code == IFTHEN) {
3697 /* NOTE - There is similar code to this block below for
3698 * handling TRIE nodes on a re-study. If you change stuff here
3699 * check there too. */
3700 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3702 regnode * const startbranch=scan;
3704 if (flags & SCF_DO_SUBSTR) {
3705 /* Cannot merge strings after this. */
3706 scan_commit(pRExC_state, data, minlenp, is_inf);
3709 if (flags & SCF_DO_STCLASS)
3710 ssc_init_zero(pRExC_state, &accum);
3712 while (OP(scan) == code) {
3713 SSize_t deltanext, minnext, fake;
3715 regnode_ssc this_class;
3718 data_fake.flags = 0;
3720 data_fake.whilem_c = data->whilem_c;
3721 data_fake.last_closep = data->last_closep;
3724 data_fake.last_closep = &fake;
3726 data_fake.pos_delta = delta;
3727 next = regnext(scan);
3728 scan = NEXTOPER(scan);
3730 scan = NEXTOPER(scan);
3731 if (flags & SCF_DO_STCLASS) {
3732 ssc_init(pRExC_state, &this_class);
3733 data_fake.start_class = &this_class;
3734 f = SCF_DO_STCLASS_AND;
3736 if (flags & SCF_WHILEM_VISITED_POS)
3737 f |= SCF_WHILEM_VISITED_POS;
3739 /* we suppose the run is continuous, last=next...*/
3740 minnext = study_chunk(pRExC_state, &scan, minlenp,
3741 &deltanext, next, &data_fake, stopparen,
3742 recursed_depth, NULL, f,depth+1);
3745 if (deltanext == SSize_t_MAX) {
3746 is_inf = is_inf_internal = 1;
3748 } else if (max1 < minnext + deltanext)
3749 max1 = minnext + deltanext;
3751 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3753 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3754 if ( stopmin > minnext)
3755 stopmin = min + min1;
3756 flags &= ~SCF_DO_SUBSTR;
3758 data->flags |= SCF_SEEN_ACCEPT;
3761 if (data_fake.flags & SF_HAS_EVAL)
3762 data->flags |= SF_HAS_EVAL;
3763 data->whilem_c = data_fake.whilem_c;
3765 if (flags & SCF_DO_STCLASS)
3766 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3768 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3770 if (flags & SCF_DO_SUBSTR) {
3771 data->pos_min += min1;
3772 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3773 data->pos_delta = SSize_t_MAX;
3775 data->pos_delta += max1 - min1;
3776 if (max1 != min1 || is_inf)
3777 data->longest = &(data->longest_float);
3780 if (delta == SSize_t_MAX
3781 || SSize_t_MAX - delta - (max1 - min1) < 0)
3782 delta = SSize_t_MAX;
3784 delta += max1 - min1;
3785 if (flags & SCF_DO_STCLASS_OR) {
3786 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3788 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3789 flags &= ~SCF_DO_STCLASS;
3792 else if (flags & SCF_DO_STCLASS_AND) {
3794 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3795 flags &= ~SCF_DO_STCLASS;
3798 /* Switch to OR mode: cache the old value of
3799 * data->start_class */
3801 StructCopy(data->start_class, and_withp, regnode_ssc);
3802 flags &= ~SCF_DO_STCLASS_AND;
3803 StructCopy(&accum, data->start_class, regnode_ssc);
3804 flags |= SCF_DO_STCLASS_OR;
3808 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3809 OP( startbranch ) == BRANCH )
3813 Assuming this was/is a branch we are dealing with: 'scan'
3814 now points at the item that follows the branch sequence,
3815 whatever it is. We now start at the beginning of the
3816 sequence and look for subsequences of
3822 which would be constructed from a pattern like
3825 If we can find such a subsequence we need to turn the first
3826 element into a trie and then add the subsequent branch exact
3827 strings to the trie.
3831 1. patterns where the whole set of branches can be
3834 2. patterns where only a subset can be converted.
3836 In case 1 we can replace the whole set with a single regop
3837 for the trie. In case 2 we need to keep the start and end
3840 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3841 becomes BRANCH TRIE; BRANCH X;
3843 There is an additional case, that being where there is a
3844 common prefix, which gets split out into an EXACT like node
3845 preceding the TRIE node.
3847 If x(1..n)==tail then we can do a simple trie, if not we make
3848 a "jump" trie, such that when we match the appropriate word
3849 we "jump" to the appropriate tail node. Essentially we turn
3850 a nested if into a case structure of sorts.
3855 if (!re_trie_maxbuff) {
3856 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3857 if (!SvIOK(re_trie_maxbuff))
3858 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3860 if ( SvIV(re_trie_maxbuff)>=0 ) {
3862 regnode *first = (regnode *)NULL;
3863 regnode *last = (regnode *)NULL;
3864 regnode *tail = scan;
3869 SV * const mysv = sv_newmortal(); /* for dumping */
3871 /* var tail is used because there may be a TAIL
3872 regop in the way. Ie, the exacts will point to the
3873 thing following the TAIL, but the last branch will
3874 point at the TAIL. So we advance tail. If we
3875 have nested (?:) we may have to move through several
3879 while ( OP( tail ) == TAIL ) {
3880 /* this is the TAIL generated by (?:) */
3881 tail = regnext( tail );
3885 DEBUG_TRIE_COMPILE_r({
3886 regprop(RExC_rx, mysv, tail, NULL);
3887 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3888 (int)depth * 2 + 2, "",
3889 "Looking for TRIE'able sequences. Tail node is: ",
3890 SvPV_nolen_const( mysv )
3896 Step through the branches
3897 cur represents each branch,
3898 noper is the first thing to be matched as part
3900 noper_next is the regnext() of that node.
3902 We normally handle a case like this
3903 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3904 support building with NOJUMPTRIE, which restricts
3905 the trie logic to structures like /FOO|BAR/.
3907 If noper is a trieable nodetype then the branch is
3908 a possible optimization target. If we are building
3909 under NOJUMPTRIE then we require that noper_next is
3910 the same as scan (our current position in the regex
3913 Once we have two or more consecutive such branches
3914 we can create a trie of the EXACT's contents and
3915 stitch it in place into the program.
3917 If the sequence represents all of the branches in
3918 the alternation we replace the entire thing with a
3921 Otherwise when it is a subsequence we need to
3922 stitch it in place and replace only the relevant
3923 branches. This means the first branch has to remain
3924 as it is used by the alternation logic, and its
3925 next pointer, and needs to be repointed at the item
3926 on the branch chain following the last branch we
3927 have optimized away.
3929 This could be either a BRANCH, in which case the
3930 subsequence is internal, or it could be the item
3931 following the branch sequence in which case the
3932 subsequence is at the end (which does not
3933 necessarily mean the first node is the start of the
3936 TRIE_TYPE(X) is a define which maps the optype to a
3940 ----------------+-----------
3944 EXACTFU_SS | EXACTFU
3949 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3950 ( EXACT == (X) ) ? EXACT : \
3951 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3952 ( EXACTFA == (X) ) ? EXACTFA : \
3955 /* dont use tail as the end marker for this traverse */
3956 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3957 regnode * const noper = NEXTOPER( cur );
3958 U8 noper_type = OP( noper );
3959 U8 noper_trietype = TRIE_TYPE( noper_type );
3960 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3961 regnode * const noper_next = regnext( noper );
3962 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3963 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3966 DEBUG_TRIE_COMPILE_r({
3967 regprop(RExC_rx, mysv, cur, NULL);
3968 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3969 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3971 regprop(RExC_rx, mysv, noper, NULL);
3972 PerlIO_printf( Perl_debug_log, " -> %s",
3973 SvPV_nolen_const(mysv));
3976 regprop(RExC_rx, mysv, noper_next, NULL);
3977 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3978 SvPV_nolen_const(mysv));
3980 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3981 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3982 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3986 /* Is noper a trieable nodetype that can be merged
3987 * with the current trie (if there is one)? */
3991 ( noper_trietype == NOTHING)
3992 || ( trietype == NOTHING )
3993 || ( trietype == noper_trietype )
3996 && noper_next == tail
4000 /* Handle mergable triable node Either we are
4001 * the first node in a new trieable sequence,
4002 * in which case we do some bookkeeping,
4003 * otherwise we update the end pointer. */
4006 if ( noper_trietype == NOTHING ) {
4007 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4008 regnode * const noper_next = regnext( noper );
4009 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4010 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4013 if ( noper_next_trietype ) {
4014 trietype = noper_next_trietype;
4015 } else if (noper_next_type) {
4016 /* a NOTHING regop is 1 regop wide.
4017 * We need at least two for a trie
4018 * so we can't merge this in */
4022 trietype = noper_trietype;
4025 if ( trietype == NOTHING )
4026 trietype = noper_trietype;
4031 } /* end handle mergable triable node */
4033 /* handle unmergable node -
4034 * noper may either be a triable node which can
4035 * not be tried together with the current trie,
4036 * or a non triable node */
4038 /* If last is set and trietype is not
4039 * NOTHING then we have found at least two
4040 * triable branch sequences in a row of a
4041 * similar trietype so we can turn them
4042 * into a trie. If/when we allow NOTHING to
4043 * start a trie sequence this condition
4044 * will be required, and it isn't expensive
4045 * so we leave it in for now. */
4046 if ( trietype && trietype != NOTHING )
4047 make_trie( pRExC_state,
4048 startbranch, first, cur, tail,
4049 count, trietype, depth+1 );
4050 last = NULL; /* note: we clear/update
4051 first, trietype etc below,
4052 so we dont do it here */
4056 && noper_next == tail
4059 /* noper is triable, so we can start a new
4063 trietype = noper_trietype;
4065 /* if we already saw a first but the
4066 * current node is not triable then we have
4067 * to reset the first information. */
4072 } /* end handle unmergable node */
4073 } /* loop over branches */
4074 DEBUG_TRIE_COMPILE_r({
4075 regprop(RExC_rx, mysv, cur, NULL);
4076 PerlIO_printf( Perl_debug_log,
4077 "%*s- %s (%d) <SCAN FINISHED>\n",
4079 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4082 if ( last && trietype ) {
4083 if ( trietype != NOTHING ) {
4084 /* the last branch of the sequence was part of
4085 * a trie, so we have to construct it here
4086 * outside of the loop */
4087 made= make_trie( pRExC_state, startbranch,
4088 first, scan, tail, count,
4089 trietype, depth+1 );
4090 #ifdef TRIE_STUDY_OPT
4091 if ( ((made == MADE_EXACT_TRIE &&
4092 startbranch == first)
4093 || ( first_non_open == first )) &&
4095 flags |= SCF_TRIE_RESTUDY;
4096 if ( startbranch == first
4099 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4104 /* at this point we know whatever we have is a
4105 * NOTHING sequence/branch AND if 'startbranch'
4106 * is 'first' then we can turn the whole thing
4109 if ( startbranch == first ) {
4111 /* the entire thing is a NOTHING sequence,
4112 * something like this: (?:|) So we can
4113 * turn it into a plain NOTHING op. */
4114 DEBUG_TRIE_COMPILE_r({
4115 regprop(RExC_rx, mysv, cur, NULL);
4116 PerlIO_printf( Perl_debug_log,
4117 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4118 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4121 OP(startbranch)= NOTHING;
4122 NEXT_OFF(startbranch)= tail - startbranch;
4123 for ( opt= startbranch + 1; opt < tail ; opt++ )
4127 } /* end if ( last) */
4128 } /* TRIE_MAXBUF is non zero */
4133 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4134 scan = NEXTOPER(NEXTOPER(scan));
4135 } else /* single branch is optimized. */
4136 scan = NEXTOPER(scan);
4138 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4139 scan_frame *newframe = NULL;
4143 U32 my_recursed_depth= recursed_depth;
4145 if (OP(scan) != SUSPEND) {
4146 /* set the pointer */
4147 if (OP(scan) == GOSUB) {
4149 RExC_recurse[ARG2L(scan)] = scan;
4150 start = RExC_open_parens[paren-1];
4151 end = RExC_close_parens[paren-1];
4154 start = RExC_rxi->program + 1;
4159 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4161 if (!recursed_depth) {
4162 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4164 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4165 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4166 RExC_study_chunk_recursed_bytes, U8);
4168 /* we havent recursed into this paren yet, so recurse into it */
4169 DEBUG_STUDYDATA("set:", data,depth);
4170 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4171 my_recursed_depth= recursed_depth + 1;
4172 Newx(newframe,1,scan_frame);
4174 DEBUG_STUDYDATA("inf:", data,depth);
4175 /* some form of infinite recursion, assume infinite length
4177 if (flags & SCF_DO_SUBSTR) {
4178 scan_commit(pRExC_state, data, minlenp, is_inf);
4179 data->longest = &(data->longest_float);
4181 is_inf = is_inf_internal = 1;
4182 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4183 ssc_anything(data->start_class);
4184 flags &= ~SCF_DO_STCLASS;
4187 Newx(newframe,1,scan_frame);
4190 end = regnext(scan);
4195 SAVEFREEPV(newframe);
4196 newframe->next = regnext(scan);
4197 newframe->last = last;
4198 newframe->stop = stopparen;
4199 newframe->prev = frame;
4200 newframe->prev_recursed_depth = recursed_depth;
4202 DEBUG_STUDYDATA("frame-new:",data,depth);
4203 DEBUG_PEEP("fnew", scan, depth);
4210 recursed_depth= my_recursed_depth;
4215 else if (OP(scan) == EXACT) {
4216 SSize_t l = STR_LEN(scan);
4219 const U8 * const s = (U8*)STRING(scan);
4220 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4221 l = utf8_length(s, s + l);
4223 uc = *((U8*)STRING(scan));
4226 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4227 /* The code below prefers earlier match for fixed
4228 offset, later match for variable offset. */
4229 if (data->last_end == -1) { /* Update the start info. */
4230 data->last_start_min = data->pos_min;
4231 data->last_start_max = is_inf
4232 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4234 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4236 SvUTF8_on(data->last_found);
4238 SV * const sv = data->last_found;
4239 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4240 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4241 if (mg && mg->mg_len >= 0)
4242 mg->mg_len += utf8_length((U8*)STRING(scan),
4243 (U8*)STRING(scan)+STR_LEN(scan));
4245 data->last_end = data->pos_min + l;
4246 data->pos_min += l; /* As in the first entry. */
4247 data->flags &= ~SF_BEFORE_EOL;
4250 /* ANDing the code point leaves at most it, and not in locale, and
4251 * can't match null string */
4252 if (flags & SCF_DO_STCLASS_AND) {
4253 ssc_cp_and(data->start_class, uc);
4254 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4255 ssc_clear_locale(data->start_class);
4257 else if (flags & SCF_DO_STCLASS_OR) {
4258 ssc_add_cp(data->start_class, uc);
4259 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4261 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4262 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4264 flags &= ~SCF_DO_STCLASS;
4266 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
4268 SSize_t l = STR_LEN(scan);
4269 UV uc = *((U8*)STRING(scan));
4270 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4271 separate code points */
4272 const U8 * s = (U8*)STRING(scan);
4274 /* Search for fixed substrings supports EXACT only. */
4275 if (flags & SCF_DO_SUBSTR) {
4277 scan_commit(pRExC_state, data, minlenp, is_inf);
4280 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4281 l = utf8_length(s, s + l);
4283 if (unfolded_multi_char) {
4284 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4286 min += l - min_subtract;
4288 delta += min_subtract;
4289 if (flags & SCF_DO_SUBSTR) {
4290 data->pos_min += l - min_subtract;
4291 if (data->pos_min < 0) {
4294 data->pos_delta += min_subtract;
4296 data->longest = &(data->longest_float);
4300 if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4301 ssc_clear_locale(data->start_class);
4306 /* We punt and assume can match anything if the node begins
4307 * with a multi-character fold. Things are complicated. For
4308 * example, /ffi/i could match any of:
4309 * "\N{LATIN SMALL LIGATURE FFI}"
4310 * "\N{LATIN SMALL LIGATURE FF}I"
4311 * "F\N{LATIN SMALL LIGATURE FI}"
4312 * plus several other things; and making sure we have all the
4313 * possibilities is hard. */
4314 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4316 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4320 /* Any Latin1 range character can potentially match any
4321 * other depending on the locale */
4322 if (OP(scan) == EXACTFL) {
4323 _invlist_union(EXACTF_invlist, PL_Latin1,
4327 /* But otherwise, it matches at least itself. We can
4328 * quickly tell if it has a distinct fold, and if so,
4329 * it matches that as well */
4330 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4331 if (IS_IN_SOME_FOLD_L1(uc)) {
4332 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4333 PL_fold_latin1[uc]);
4337 /* Some characters match above-Latin1 ones under /i. This
4338 * is true of EXACTFL ones when the locale is UTF-8 */
4339 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4340 && (! isASCII(uc) || (OP(scan) != EXACTFA
4341 && OP(scan) != EXACTFA_NO_TRIE)))
4343 add_above_Latin1_folds(pRExC_state,
4349 else { /* Pattern is UTF-8 */
4350 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4351 STRLEN foldlen = UTF8SKIP(s);
4352 const U8* e = s + STR_LEN(scan);
4355 /* The only code points that aren't folded in a UTF EXACTFish
4356 * node are are the problematic ones in EXACTFL nodes */
4357 if (OP(scan) == EXACTFL
4358 && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4360 /* We need to check for the possibility that this EXACTFL
4361 * node begins with a multi-char fold. Therefore we fold
4362 * the first few characters of it so that we can make that
4367 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4369 *(d++) = (U8) toFOLD(*s);
4374 to_utf8_fold(s, d, &len);
4380 /* And set up so the code below that looks in this folded
4381 * buffer instead of the node's string */
4383 foldlen = UTF8SKIP(folded);
4387 /* When we reach here 's' points to the fold of the first
4388 * character(s) of the node; and 'e' points to far enough along
4389 * the folded string to be just past any possible multi-char
4390 * fold. 'foldlen' is the length in bytes of the first
4393 * Unlike the non-UTF-8 case, the macro for determining if a
4394 * string is a multi-char fold requires all the characters to
4395 * already be folded. This is because of all the complications
4396 * if not. Note that they are folded anyway, except in EXACTFL
4397 * nodes. Like the non-UTF case above, we punt if the node
4398 * begins with a multi-char fold */
4400 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4402 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4404 else { /* Single char fold */
4406 /* It matches all the things that fold to it, which are
4407 * found in PL_utf8_foldclosures (including itself) */
4408 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4409 if (! PL_utf8_foldclosures) {
4410 _load_PL_utf8_foldclosures();
4412 if ((listp = hv_fetch(PL_utf8_foldclosures,
4413 (char *) s, foldlen, FALSE)))
4415 AV* list = (AV*) *listp;
4417 for (k = 0; k <= av_tindex(list); k++) {
4418 SV** c_p = av_fetch(list, k, FALSE);
4424 /* /aa doesn't allow folds between ASCII and non- */
4425 if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4426 && isASCII(c) != isASCII(uc))
4431 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4436 if (flags & SCF_DO_STCLASS_AND) {
4437 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4438 ANYOF_POSIXL_ZERO(data->start_class);
4439 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4441 else if (flags & SCF_DO_STCLASS_OR) {
4442 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4443 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4445 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4446 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4448 flags &= ~SCF_DO_STCLASS;
4449 SvREFCNT_dec(EXACTF_invlist);
4451 else if (REGNODE_VARIES(OP(scan))) {
4452 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4453 I32 fl = 0, f = flags;
4454 regnode * const oscan = scan;
4455 regnode_ssc this_class;
4456 regnode_ssc *oclass = NULL;
4457 I32 next_is_eval = 0;
4459 switch (PL_regkind[OP(scan)]) {
4460 case WHILEM: /* End of (?:...)* . */
4461 scan = NEXTOPER(scan);
4464 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4465 next = NEXTOPER(scan);
4466 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4468 maxcount = REG_INFTY;
4469 next = regnext(scan);
4470 scan = NEXTOPER(scan);
4474 if (flags & SCF_DO_SUBSTR)
4479 if (flags & SCF_DO_STCLASS) {
4481 maxcount = REG_INFTY;
4482 next = regnext(scan);
4483 scan = NEXTOPER(scan);
4486 if (flags & SCF_DO_SUBSTR) {
4487 scan_commit(pRExC_state, data, minlenp, is_inf);
4488 /* Cannot extend fixed substrings */
4489 data->longest = &(data->longest_float);
4491 is_inf = is_inf_internal = 1;
4492 scan = regnext(scan);
4493 goto optimize_curly_tail;
4495 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4496 && (scan->flags == stopparen))
4501 mincount = ARG1(scan);
4502 maxcount = ARG2(scan);
4504 next = regnext(scan);
4505 if (OP(scan) == CURLYX) {
4506 I32 lp = (data ? *(data->last_closep) : 0);
4507 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4509 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4510 next_is_eval = (OP(scan) == EVAL);
4512 if (flags & SCF_DO_SUBSTR) {
4514 scan_commit(pRExC_state, data, minlenp, is_inf);
4515 /* Cannot extend fixed substrings */
4516 pos_before = data->pos_min;
4520 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4522 data->flags |= SF_IS_INF;
4524 if (flags & SCF_DO_STCLASS) {
4525 ssc_init(pRExC_state, &this_class);
4526 oclass = data->start_class;
4527 data->start_class = &this_class;
4528 f |= SCF_DO_STCLASS_AND;
4529 f &= ~SCF_DO_STCLASS_OR;
4531 /* Exclude from super-linear cache processing any {n,m}
4532 regops for which the combination of input pos and regex
4533 pos is not enough information to determine if a match
4536 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4537 regex pos at the \s*, the prospects for a match depend not
4538 only on the input position but also on how many (bar\s*)
4539 repeats into the {4,8} we are. */
4540 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4541 f &= ~SCF_WHILEM_VISITED_POS;
4543 /* This will finish on WHILEM, setting scan, or on NULL: */
4544 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4545 last, data, stopparen, recursed_depth, NULL,
4547 ? (f & ~SCF_DO_SUBSTR)
4551 if (flags & SCF_DO_STCLASS)
4552 data->start_class = oclass;
4553 if (mincount == 0 || minnext == 0) {
4554 if (flags & SCF_DO_STCLASS_OR) {
4555 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4557 else if (flags & SCF_DO_STCLASS_AND) {
4558 /* Switch to OR mode: cache the old value of
4559 * data->start_class */
4561 StructCopy(data->start_class, and_withp, regnode_ssc);
4562 flags &= ~SCF_DO_STCLASS_AND;
4563 StructCopy(&this_class, data->start_class, regnode_ssc);
4564 flags |= SCF_DO_STCLASS_OR;
4565 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4567 } else { /* Non-zero len */
4568 if (flags & SCF_DO_STCLASS_OR) {
4569 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4570 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4572 else if (flags & SCF_DO_STCLASS_AND)
4573 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4574 flags &= ~SCF_DO_STCLASS;
4576 if (!scan) /* It was not CURLYX, but CURLY. */
4578 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4579 /* ? quantifier ok, except for (?{ ... }) */
4580 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4581 && (minnext == 0) && (deltanext == 0)
4582 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4583 && maxcount <= REG_INFTY/3) /* Complement check for big
4586 /* Fatal warnings may leak the regexp without this: */
4587 SAVEFREESV(RExC_rx_sv);
4588 ckWARNreg(RExC_parse,
4589 "Quantifier unexpected on zero-length expression");
4590 (void)ReREFCNT_inc(RExC_rx_sv);
4593 min += minnext * mincount;
4594 is_inf_internal |= deltanext == SSize_t_MAX
4595 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4596 is_inf |= is_inf_internal;
4598 delta = SSize_t_MAX;
4600 delta += (minnext + deltanext) * maxcount
4601 - minnext * mincount;
4603 /* Try powerful optimization CURLYX => CURLYN. */
4604 if ( OP(oscan) == CURLYX && data
4605 && data->flags & SF_IN_PAR
4606 && !(data->flags & SF_HAS_EVAL)
4607 && !deltanext && minnext == 1 ) {
4608 /* Try to optimize to CURLYN. */
4609 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4610 regnode * const nxt1 = nxt;
4617 if (!REGNODE_SIMPLE(OP(nxt))
4618 && !(PL_regkind[OP(nxt)] == EXACT
4619 && STR_LEN(nxt) == 1))
4625 if (OP(nxt) != CLOSE)
4627 if (RExC_open_parens) {
4628 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4629 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4631 /* Now we know that nxt2 is the only contents: */
4632 oscan->flags = (U8)ARG(nxt);
4634 OP(nxt1) = NOTHING; /* was OPEN. */
4637 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4638 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4639 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4640 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4641 OP(nxt + 1) = OPTIMIZED; /* was count. */
4642 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4647 /* Try optimization CURLYX => CURLYM. */
4648 if ( OP(oscan) == CURLYX && data
4649 && !(data->flags & SF_HAS_PAR)
4650 && !(data->flags & SF_HAS_EVAL)
4651 && !deltanext /* atom is fixed width */
4652 && minnext != 0 /* CURLYM can't handle zero width */
4654 /* Nor characters whose fold at run-time may be
4655 * multi-character */
4656 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4658 /* XXXX How to optimize if data == 0? */
4659 /* Optimize to a simpler form. */
4660 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4664 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4665 && (OP(nxt2) != WHILEM))
4667 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4668 /* Need to optimize away parenths. */
4669 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4670 /* Set the parenth number. */
4671 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4673 oscan->flags = (U8)ARG(nxt);
4674 if (RExC_open_parens) {
4675 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4676 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4678 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4679 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4682 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4683 OP(nxt + 1) = OPTIMIZED; /* was count. */
4684 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4685 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4688 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4689 regnode *nnxt = regnext(nxt1);
4691 if (reg_off_by_arg[OP(nxt1)])
4692 ARG_SET(nxt1, nxt2 - nxt1);
4693 else if (nxt2 - nxt1 < U16_MAX)
4694 NEXT_OFF(nxt1) = nxt2 - nxt1;
4696 OP(nxt) = NOTHING; /* Cannot beautify */
4701 /* Optimize again: */
4702 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4703 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4708 else if ((OP(oscan) == CURLYX)
4709 && (flags & SCF_WHILEM_VISITED_POS)
4710 /* See the comment on a similar expression above.
4711 However, this time it's not a subexpression
4712 we care about, but the expression itself. */
4713 && (maxcount == REG_INFTY)
4714 && data && ++data->whilem_c < 16) {
4715 /* This stays as CURLYX, we can put the count/of pair. */
4716 /* Find WHILEM (as in regexec.c) */
4717 regnode *nxt = oscan + NEXT_OFF(oscan);
4719 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4721 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4722 | (RExC_whilem_seen << 4)); /* On WHILEM */
4724 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4726 if (flags & SCF_DO_SUBSTR) {
4727 SV *last_str = NULL;
4728 STRLEN last_chrs = 0;
4729 int counted = mincount != 0;
4731 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4733 SSize_t b = pos_before >= data->last_start_min
4734 ? pos_before : data->last_start_min;
4736 const char * const s = SvPV_const(data->last_found, l);
4737 SSize_t old = b - data->last_start_min;
4740 old = utf8_hop((U8*)s, old) - (U8*)s;
4742 /* Get the added string: */
4743 last_str = newSVpvn_utf8(s + old, l, UTF);
4744 last_chrs = UTF ? utf8_length((U8*)(s + old),
4745 (U8*)(s + old + l)) : l;
4746 if (deltanext == 0 && pos_before == b) {
4747 /* What was added is a constant string */
4750 SvGROW(last_str, (mincount * l) + 1);
4751 repeatcpy(SvPVX(last_str) + l,
4752 SvPVX_const(last_str), l,
4754 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4755 /* Add additional parts. */
4756 SvCUR_set(data->last_found,
4757 SvCUR(data->last_found) - l);
4758 sv_catsv(data->last_found, last_str);
4760 SV * sv = data->last_found;
4762 SvUTF8(sv) && SvMAGICAL(sv) ?
4763 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4764 if (mg && mg->mg_len >= 0)
4765 mg->mg_len += last_chrs * (mincount-1);
4767 last_chrs *= mincount;
4768 data->last_end += l * (mincount - 1);
4771 /* start offset must point into the last copy */
4772 data->last_start_min += minnext * (mincount - 1);
4773 data->last_start_max += is_inf ? SSize_t_MAX
4774 : (maxcount - 1) * (minnext + data->pos_delta);
4777 /* It is counted once already... */
4778 data->pos_min += minnext * (mincount - counted);
4780 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4781 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4782 " maxcount=%"UVuf" mincount=%"UVuf"\n",
4783 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4785 if (deltanext != SSize_t_MAX)
4786 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4787 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4788 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4790 if (deltanext == SSize_t_MAX
4791 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4792 data->pos_delta = SSize_t_MAX;
4794 data->pos_delta += - counted * deltanext +
4795 (minnext + deltanext) * maxcount - minnext * mincount;
4796 if (mincount != maxcount) {
4797 /* Cannot extend fixed substrings found inside
4799 scan_commit(pRExC_state, data, minlenp, is_inf);
4800 if (mincount && last_str) {
4801 SV * const sv = data->last_found;
4802 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4803 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4807 sv_setsv(sv, last_str);
4808 data->last_end = data->pos_min;
4809 data->last_start_min = data->pos_min - last_chrs;
4810 data->last_start_max = is_inf
4812 : data->pos_min + data->pos_delta - last_chrs;
4814 data->longest = &(data->longest_float);
4816 SvREFCNT_dec(last_str);
4818 if (data && (fl & SF_HAS_EVAL))
4819 data->flags |= SF_HAS_EVAL;
4820 optimize_curly_tail:
4821 if (OP(oscan) != CURLYX) {
4822 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4824 NEXT_OFF(oscan) += NEXT_OFF(next);
4830 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4835 if (flags & SCF_DO_SUBSTR) {
4836 /* Cannot expect anything... */
4837 scan_commit(pRExC_state, data, minlenp, is_inf);
4838 data->longest = &(data->longest_float);
4840 is_inf = is_inf_internal = 1;
4841 if (flags & SCF_DO_STCLASS_OR) {
4842 if (OP(scan) == CLUMP) {
4843 /* Actually is any start char, but very few code points
4844 * aren't start characters */
4845 ssc_match_all_cp(data->start_class);
4848 ssc_anything(data->start_class);
4851 flags &= ~SCF_DO_STCLASS;
4855 else if (OP(scan) == LNBREAK) {
4856 if (flags & SCF_DO_STCLASS) {
4857 if (flags & SCF_DO_STCLASS_AND) {
4858 ssc_intersection(data->start_class,
4859 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4860 ssc_clear_locale(data->start_class);
4861 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4863 else if (flags & SCF_DO_STCLASS_OR) {
4864 ssc_union(data->start_class,
4865 PL_XPosix_ptrs[_CC_VERTSPACE],
4867 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4869 /* See commit msg for
4870 * 749e076fceedeb708a624933726e7989f2302f6a */
4871 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4873 flags &= ~SCF_DO_STCLASS;
4876 delta++; /* Because of the 2 char string cr-lf */
4877 if (flags & SCF_DO_SUBSTR) {
4878 /* Cannot expect anything... */
4879 scan_commit(pRExC_state, data, minlenp, is_inf);
4881 data->pos_delta += 1;
4882 data->longest = &(data->longest_float);
4885 else if (REGNODE_SIMPLE(OP(scan))) {
4887 if (flags & SCF_DO_SUBSTR) {
4888 scan_commit(pRExC_state, data, minlenp, is_inf);
4892 if (flags & SCF_DO_STCLASS) {
4894 SV* my_invlist = sv_2mortal(_new_invlist(0));
4897 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4898 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4900 /* Some of the logic below assumes that switching
4901 locale on will only add false positives. */
4906 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4911 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4912 ssc_match_all_cp(data->start_class);
4917 SV* REG_ANY_invlist = _new_invlist(2);
4918 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4920 if (flags & SCF_DO_STCLASS_OR) {
4921 ssc_union(data->start_class,
4923 TRUE /* TRUE => invert, hence all but \n
4927 else if (flags & SCF_DO_STCLASS_AND) {
4928 ssc_intersection(data->start_class,
4930 TRUE /* TRUE => invert */
4932 ssc_clear_locale(data->start_class);
4934 SvREFCNT_dec_NN(REG_ANY_invlist);
4939 if (flags & SCF_DO_STCLASS_AND)
4940 ssc_and(pRExC_state, data->start_class,
4941 (regnode_charclass *) scan);
4943 ssc_or(pRExC_state, data->start_class,
4944 (regnode_charclass *) scan);
4952 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4953 if (flags & SCF_DO_STCLASS_AND) {
4954 bool was_there = cBOOL(
4955 ANYOF_POSIXL_TEST(data->start_class,
4957 ANYOF_POSIXL_ZERO(data->start_class);
4958 if (was_there) { /* Do an AND */
4959 ANYOF_POSIXL_SET(data->start_class, namedclass);
4961 /* No individual code points can now match */
4962 data->start_class->invlist
4963 = sv_2mortal(_new_invlist(0));
4966 int complement = namedclass + ((invert) ? -1 : 1);
4968 assert(flags & SCF_DO_STCLASS_OR);
4970 /* If the complement of this class was already there,
4971 * the result is that they match all code points,
4972 * (\d + \D == everything). Remove the classes from
4973 * future consideration. Locale is not relevant in
4975 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4976 ssc_match_all_cp(data->start_class);
4977 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4978 ANYOF_POSIXL_CLEAR(data->start_class, complement);
4980 else { /* The usual case; just add this class to the
4982 ANYOF_POSIXL_SET(data->start_class, namedclass);
4987 case NPOSIXA: /* For these, we always know the exact set of
4992 if (FLAGS(scan) == _CC_ASCII) {
4993 my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4996 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4997 PL_XPosix_ptrs[_CC_ASCII],
5008 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5010 /* NPOSIXD matches all upper Latin1 code points unless the
5011 * target string being matched is UTF-8, which is
5012 * unknowable until match time. Since we are going to
5013 * invert, we want to get rid of all of them so that the
5014 * inversion will match all */
5015 if (OP(scan) == NPOSIXD) {
5016 _invlist_subtract(my_invlist, PL_UpperLatin1,
5022 if (flags & SCF_DO_STCLASS_AND) {
5023 ssc_intersection(data->start_class, my_invlist, invert);
5024 ssc_clear_locale(data->start_class);
5027 assert(flags & SCF_DO_STCLASS_OR);
5028 ssc_union(data->start_class, my_invlist, invert);
5031 if (flags & SCF_DO_STCLASS_OR)
5032 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5033 flags &= ~SCF_DO_STCLASS;
5036 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5037 data->flags |= (OP(scan) == MEOL
5040 scan_commit(pRExC_state, data, minlenp, is_inf);
5043 else if ( PL_regkind[OP(scan)] == BRANCHJ
5044 /* Lookbehind, or need to calculate parens/evals/stclass: */
5045 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5046 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5048 if ( OP(scan) == UNLESSM &&
5050 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5051 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5054 regnode *upto= regnext(scan);
5056 SV * const mysv_val=sv_newmortal();
5057 DEBUG_STUDYDATA("OPFAIL",data,depth);
5059 /*DEBUG_PARSE_MSG("opfail");*/
5060 regprop(RExC_rx, mysv_val, upto, NULL);
5061 PerlIO_printf(Perl_debug_log,
5062 "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5063 SvPV_nolen_const(mysv_val),
5064 (IV)REG_NODE_NUM(upto),
5069 NEXT_OFF(scan) = upto - scan;
5070 for (opt= scan + 1; opt < upto ; opt++)
5071 OP(opt) = OPTIMIZED;
5075 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5076 || OP(scan) == UNLESSM )
5078 /* Negative Lookahead/lookbehind
5079 In this case we can't do fixed string optimisation.
5082 SSize_t deltanext, minnext, fake = 0;
5087 data_fake.flags = 0;
5089 data_fake.whilem_c = data->whilem_c;
5090 data_fake.last_closep = data->last_closep;
5093 data_fake.last_closep = &fake;
5094 data_fake.pos_delta = delta;
5095 if ( flags & SCF_DO_STCLASS && !scan->flags
5096 && OP(scan) == IFMATCH ) { /* Lookahead */
5097 ssc_init(pRExC_state, &intrnl);
5098 data_fake.start_class = &intrnl;
5099 f |= SCF_DO_STCLASS_AND;
5101 if (flags & SCF_WHILEM_VISITED_POS)
5102 f |= SCF_WHILEM_VISITED_POS;
5103 next = regnext(scan);
5104 nscan = NEXTOPER(NEXTOPER(scan));
5105 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5106 last, &data_fake, stopparen,
5107 recursed_depth, NULL, f, depth+1);
5110 FAIL("Variable length lookbehind not implemented");
5112 else if (minnext > (I32)U8_MAX) {
5113 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5116 scan->flags = (U8)minnext;
5119 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5121 if (data_fake.flags & SF_HAS_EVAL)
5122 data->flags |= SF_HAS_EVAL;
5123 data->whilem_c = data_fake.whilem_c;
5125 if (f & SCF_DO_STCLASS_AND) {
5126 if (flags & SCF_DO_STCLASS_OR) {
5127 /* OR before, AND after: ideally we would recurse with
5128 * data_fake to get the AND applied by study of the
5129 * remainder of the pattern, and then derecurse;
5130 * *** HACK *** for now just treat as "no information".
5131 * See [perl #56690].
5133 ssc_init(pRExC_state, data->start_class);
5135 /* AND before and after: combine and continue. These
5136 * assertions are zero-length, so can match an EMPTY
5138 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5139 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5143 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5145 /* Positive Lookahead/lookbehind
5146 In this case we can do fixed string optimisation,
5147 but we must be careful about it. Note in the case of
5148 lookbehind the positions will be offset by the minimum
5149 length of the pattern, something we won't know about
5150 until after the recurse.
5152 SSize_t deltanext, fake = 0;
5156 /* We use SAVEFREEPV so that when the full compile
5157 is finished perl will clean up the allocated
5158 minlens when it's all done. This way we don't
5159 have to worry about freeing them when we know
5160 they wont be used, which would be a pain.
5163 Newx( minnextp, 1, SSize_t );
5164 SAVEFREEPV(minnextp);
5167 StructCopy(data, &data_fake, scan_data_t);
5168 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5171 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5172 data_fake.last_found=newSVsv(data->last_found);
5176 data_fake.last_closep = &fake;
5177 data_fake.flags = 0;
5178 data_fake.pos_delta = delta;
5180 data_fake.flags |= SF_IS_INF;
5181 if ( flags & SCF_DO_STCLASS && !scan->flags
5182 && OP(scan) == IFMATCH ) { /* Lookahead */
5183 ssc_init(pRExC_state, &intrnl);
5184 data_fake.start_class = &intrnl;
5185 f |= SCF_DO_STCLASS_AND;
5187 if (flags & SCF_WHILEM_VISITED_POS)
5188 f |= SCF_WHILEM_VISITED_POS;
5189 next = regnext(scan);
5190 nscan = NEXTOPER(NEXTOPER(scan));
5192 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5193 &deltanext, last, &data_fake,
5194 stopparen, recursed_depth, NULL,
5198 FAIL("Variable length lookbehind not implemented");
5200 else if (*minnextp > (I32)U8_MAX) {
5201 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5204 scan->flags = (U8)*minnextp;
5209 if (f & SCF_DO_STCLASS_AND) {
5210 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5211 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
5214 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5216 if (data_fake.flags & SF_HAS_EVAL)
5217 data->flags |= SF_HAS_EVAL;
5218 data->whilem_c = data_fake.whilem_c;
5219 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5220 if (RExC_rx->minlen<*minnextp)
5221 RExC_rx->minlen=*minnextp;
5222 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5223 SvREFCNT_dec_NN(data_fake.last_found);
5225 if ( data_fake.minlen_fixed != minlenp )
5227 data->offset_fixed= data_fake.offset_fixed;
5228 data->minlen_fixed= data_fake.minlen_fixed;
5229 data->lookbehind_fixed+= scan->flags;
5231 if ( data_fake.minlen_float != minlenp )
5233 data->minlen_float= data_fake.minlen_float;
5234 data->offset_float_min=data_fake.offset_float_min;
5235 data->offset_float_max=data_fake.offset_float_max;
5236 data->lookbehind_float+= scan->flags;
5243 else if (OP(scan) == OPEN) {
5244 if (stopparen != (I32)ARG(scan))
5247 else if (OP(scan) == CLOSE) {
5248 if (stopparen == (I32)ARG(scan)) {
5251 if ((I32)ARG(scan) == is_par) {
5252 next = regnext(scan);
5254 if ( next && (OP(next) != WHILEM) && next < last)
5255 is_par = 0; /* Disable optimization */
5258 *(data->last_closep) = ARG(scan);
5260 else if (OP(scan) == EVAL) {
5262 data->flags |= SF_HAS_EVAL;
5264 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5265 if (flags & SCF_DO_SUBSTR) {
5266 scan_commit(pRExC_state, data, minlenp, is_inf);
5267 flags &= ~SCF_DO_SUBSTR;
5269 if (data && OP(scan)==ACCEPT) {
5270 data->flags |= SCF_SEEN_ACCEPT;
5275 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5277 if (flags & SCF_DO_SUBSTR) {
5278 scan_commit(pRExC_state, data, minlenp, is_inf);
5279 data->longest = &(data->longest_float);
5281 is_inf = is_inf_internal = 1;
5282 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5283 ssc_anything(data->start_class);
5284 flags &= ~SCF_DO_STCLASS;
5286 else if (OP(scan) == GPOS) {
5287 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5288 !(delta || is_inf || (data && data->pos_delta)))
5290 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5291 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5292 if (RExC_rx->gofs < (STRLEN)min)
5293 RExC_rx->gofs = min;
5295 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5299 #ifdef TRIE_STUDY_OPT
5300 #ifdef FULL_TRIE_STUDY
5301 else if (PL_regkind[OP(scan)] == TRIE) {
5302 /* NOTE - There is similar code to this block above for handling
5303 BRANCH nodes on the initial study. If you change stuff here
5305 regnode *trie_node= scan;
5306 regnode *tail= regnext(scan);
5307 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5308 SSize_t max1 = 0, min1 = SSize_t_MAX;
5311 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5312 /* Cannot merge strings after this. */
5313 scan_commit(pRExC_state, data, minlenp, is_inf);
5315 if (flags & SCF_DO_STCLASS)
5316 ssc_init_zero(pRExC_state, &accum);
5322 const regnode *nextbranch= NULL;
5325 for ( word=1 ; word <= trie->wordcount ; word++)
5327 SSize_t deltanext=0, minnext=0, f = 0, fake;
5328 regnode_ssc this_class;
5330 data_fake.flags = 0;
5332 data_fake.whilem_c = data->whilem_c;
5333 data_fake.last_closep = data->last_closep;
5336 data_fake.last_closep = &fake;
5337 data_fake.pos_delta = delta;
5338 if (flags & SCF_DO_STCLASS) {
5339 ssc_init(pRExC_state, &this_class);
5340 data_fake.start_class = &this_class;
5341 f = SCF_DO_STCLASS_AND;
5343 if (flags & SCF_WHILEM_VISITED_POS)
5344 f |= SCF_WHILEM_VISITED_POS;
5346 if (trie->jump[word]) {
5348 nextbranch = trie_node + trie->jump[0];
5349 scan= trie_node + trie->jump[word];
5350 /* We go from the jump point to the branch that follows
5351 it. Note this means we need the vestigal unused
5352 branches even though they arent otherwise used. */
5353 minnext = study_chunk(pRExC_state, &scan, minlenp,
5354 &deltanext, (regnode *)nextbranch, &data_fake,
5355 stopparen, recursed_depth, NULL, f,depth+1);
5357 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5358 nextbranch= regnext((regnode*)nextbranch);
5360 if (min1 > (SSize_t)(minnext + trie->minlen))
5361 min1 = minnext + trie->minlen;
5362 if (deltanext == SSize_t_MAX) {
5363 is_inf = is_inf_internal = 1;
5365 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5366 max1 = minnext + deltanext + trie->maxlen;
5368 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5370 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5371 if ( stopmin > min + min1)
5372 stopmin = min + min1;
5373 flags &= ~SCF_DO_SUBSTR;
5375 data->flags |= SCF_SEEN_ACCEPT;
5378 if (data_fake.flags & SF_HAS_EVAL)
5379 data->flags |= SF_HAS_EVAL;
5380 data->whilem_c = data_fake.whilem_c;
5382 if (flags & SCF_DO_STCLASS)
5383 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5386 if (flags & SCF_DO_SUBSTR) {
5387 data->pos_min += min1;
5388 data->pos_delta += max1 - min1;
5389 if (max1 != min1 || is_inf)
5390 data->longest = &(data->longest_float);
5393 delta += max1 - min1;
5394 if (flags & SCF_DO_STCLASS_OR) {
5395 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5397 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5398 flags &= ~SCF_DO_STCLASS;
5401 else if (flags & SCF_DO_STCLASS_AND) {
5403 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5404 flags &= ~SCF_DO_STCLASS;
5407 /* Switch to OR mode: cache the old value of
5408 * data->start_class */
5410 StructCopy(data->start_class, and_withp, regnode_ssc);
5411 flags &= ~SCF_DO_STCLASS_AND;
5412 StructCopy(&accum, data->start_class, regnode_ssc);
5413 flags |= SCF_DO_STCLASS_OR;
5420 else if (PL_regkind[OP(scan)] == TRIE) {
5421 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5424 min += trie->minlen;
5425 delta += (trie->maxlen - trie->minlen);
5426 flags &= ~SCF_DO_STCLASS; /* xxx */
5427 if (flags & SCF_DO_SUBSTR) {
5428 /* Cannot expect anything... */
5429 scan_commit(pRExC_state, data, minlenp, is_inf);
5430 data->pos_min += trie->minlen;
5431 data->pos_delta += (trie->maxlen - trie->minlen);
5432 if (trie->maxlen != trie->minlen)
5433 data->longest = &(data->longest_float);
5435 if (trie->jump) /* no more substrings -- for now /grr*/
5436 flags &= ~SCF_DO_SUBSTR;
5438 #endif /* old or new */
5439 #endif /* TRIE_STUDY_OPT */
5441 /* Else: zero-length, ignore. */
5442 scan = regnext(scan);
5444 /* If we are exiting a recursion we can unset its recursed bit
5445 * and allow ourselves to enter it again - no danger of an
5446 * infinite loop there.
5447 if (stopparen > -1 && recursed) {
5448 DEBUG_STUDYDATA("unset:", data,depth);
5449 PAREN_UNSET( recursed, stopparen);
5453 DEBUG_STUDYDATA("frame-end:",data,depth);
5454 DEBUG_PEEP("fend", scan, depth);
5455 /* restore previous context */
5458 stopparen = frame->stop;
5459 recursed_depth = frame->prev_recursed_depth;
5462 frame = frame->prev;
5463 goto fake_study_recurse;
5468 DEBUG_STUDYDATA("pre-fin:",data,depth);
5471 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5473 if (flags & SCF_DO_SUBSTR && is_inf)
5474 data->pos_delta = SSize_t_MAX - data->pos_min;
5475 if (is_par > (I32)U8_MAX)
5477 if (is_par && pars==1 && data) {
5478 data->flags |= SF_IN_PAR;
5479 data->flags &= ~SF_HAS_PAR;
5481 else if (pars && data) {
5482 data->flags |= SF_HAS_PAR;
5483 data->flags &= ~SF_IN_PAR;
5485 if (flags & SCF_DO_STCLASS_OR)
5486 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5487 if (flags & SCF_TRIE_RESTUDY)
5488 data->flags |= SCF_TRIE_RESTUDY;
5490 DEBUG_STUDYDATA("post-fin:",data,depth);
5493 SSize_t final_minlen= min < stopmin ? min : stopmin;
5495 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5496 RExC_maxlen = final_minlen + delta;
5498 return final_minlen;
5504 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5506 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5508 PERL_ARGS_ASSERT_ADD_DATA;
5510 Renewc(RExC_rxi->data,
5511 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5512 char, struct reg_data);
5514 Renew(RExC_rxi->data->what, count + n, U8);
5516 Newx(RExC_rxi->data->what, n, U8);
5517 RExC_rxi->data->count = count + n;
5518 Copy(s, RExC_rxi->data->what + count, n, U8);
5522 /*XXX: todo make this not included in a non debugging perl, but appears to be
5523 * used anyway there, in 'use re' */
5524 #ifndef PERL_IN_XSUB_RE
5526 Perl_reginitcolors(pTHX)
5528 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5530 char *t = savepv(s);
5534 t = strchr(t, '\t');
5540 PL_colors[i] = t = (char *)"";
5545 PL_colors[i++] = (char *)"";
5552 #ifdef TRIE_STUDY_OPT
5553 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5556 (data.flags & SCF_TRIE_RESTUDY) \
5564 #define CHECK_RESTUDY_GOTO_butfirst
5568 * pregcomp - compile a regular expression into internal code
5570 * Decides which engine's compiler to call based on the hint currently in
5574 #ifndef PERL_IN_XSUB_RE
5576 /* return the currently in-scope regex engine (or the default if none) */
5578 regexp_engine const *
5579 Perl_current_re_engine(pTHX)
5581 if (IN_PERL_COMPILETIME) {
5582 HV * const table = GvHV(PL_hintgv);
5585 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5586 return &PL_core_reg_engine;
5587 ptr = hv_fetchs(table, "regcomp", FALSE);
5588 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5589 return &PL_core_reg_engine;
5590 return INT2PTR(regexp_engine*,SvIV(*ptr));
5594 if (!PL_curcop->cop_hints_hash)
5595 return &PL_core_reg_engine;
5596 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5597 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5598 return &PL_core_reg_engine;
5599 return INT2PTR(regexp_engine*,SvIV(ptr));
5605 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5607 regexp_engine const *eng = current_re_engine();
5608 GET_RE_DEBUG_FLAGS_DECL;
5610 PERL_ARGS_ASSERT_PREGCOMP;
5612 /* Dispatch a request to compile a regexp to correct regexp engine. */
5614 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5617 return CALLREGCOMP_ENG(eng, pattern, flags);
5621 /* public(ish) entry point for the perl core's own regex compiling code.
5622 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5623 * pattern rather than a list of OPs, and uses the internal engine rather
5624 * than the current one */
5627 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5629 SV *pat = pattern; /* defeat constness! */
5630 PERL_ARGS_ASSERT_RE_COMPILE;
5631 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5632 #ifdef PERL_IN_XSUB_RE
5635 &PL_core_reg_engine,
5637 NULL, NULL, rx_flags, 0);
5641 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5642 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5643 * point to the realloced string and length.
5645 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5649 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5650 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5652 U8 *const src = (U8*)*pat_p;
5655 STRLEN s = 0, d = 0;
5657 GET_RE_DEBUG_FLAGS_DECL;
5659 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5660 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5662 Newx(dst, *plen_p * 2 + 1, U8);
5664 while (s < *plen_p) {
5665 if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5668 dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5669 dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
5671 if (n < num_code_blocks) {
5672 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5673 pRExC_state->code_blocks[n].start = d;
5674 assert(dst[d] == '(');
5677 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5678 pRExC_state->code_blocks[n].end = d;
5679 assert(dst[d] == ')');
5689 *pat_p = (char*) dst;
5691 RExC_orig_utf8 = RExC_utf8 = 1;
5696 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5697 * while recording any code block indices, and handling overloading,
5698 * nested qr// objects etc. If pat is null, it will allocate a new
5699 * string, or just return the first arg, if there's only one.
5701 * Returns the malloced/updated pat.
5702 * patternp and pat_count is the array of SVs to be concatted;
5703 * oplist is the optional list of ops that generated the SVs;
5704 * recompile_p is a pointer to a boolean that will be set if
5705 * the regex will need to be recompiled.
5706 * delim, if non-null is an SV that will be inserted between each element
5710 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5711 SV *pat, SV ** const patternp, int pat_count,
5712 OP *oplist, bool *recompile_p, SV *delim)
5716 bool use_delim = FALSE;
5717 bool alloced = FALSE;
5719 /* if we know we have at least two args, create an empty string,
5720 * then concatenate args to that. For no args, return an empty string */
5721 if (!pat && pat_count != 1) {
5727 for (svp = patternp; svp < patternp + pat_count; svp++) {
5730 STRLEN orig_patlen = 0;
5732 SV *msv = use_delim ? delim : *svp;
5733 if (!msv) msv = &PL_sv_undef;
5735 /* if we've got a delimiter, we go round the loop twice for each
5736 * svp slot (except the last), using the delimiter the second
5745 if (SvTYPE(msv) == SVt_PVAV) {
5746 /* we've encountered an interpolated array within
5747 * the pattern, e.g. /...@a..../. Expand the list of elements,
5748 * then recursively append elements.
5749 * The code in this block is based on S_pushav() */
5751 AV *const av = (AV*)msv;
5752 const SSize_t maxarg = AvFILL(av) + 1;
5756 assert(oplist->op_type == OP_PADAV
5757 || oplist->op_type == OP_RV2AV);
5758 oplist = OP_SIBLING(oplist);
5761 if (SvRMAGICAL(av)) {
5764 Newx(array, maxarg, SV*);
5766 for (i=0; i < maxarg; i++) {
5767 SV ** const svp = av_fetch(av, i, FALSE);
5768 array[i] = svp ? *svp : &PL_sv_undef;
5772 array = AvARRAY(av);
5774 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5775 array, maxarg, NULL, recompile_p,
5777 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5783 /* we make the assumption here that each op in the list of
5784 * op_siblings maps to one SV pushed onto the stack,
5785 * except for code blocks, with have both an OP_NULL and
5787 * This allows us to match up the list of SVs against the
5788 * list of OPs to find the next code block.
5790 * Note that PUSHMARK PADSV PADSV ..
5792 * PADRANGE PADSV PADSV ..
5793 * so the alignment still works. */
5796 if (oplist->op_type == OP_NULL
5797 && (oplist->op_flags & OPf_SPECIAL))
5799 assert(n < pRExC_state->num_code_blocks);
5800 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5801 pRExC_state->code_blocks[n].block = oplist;
5802 pRExC_state->code_blocks[n].src_regex = NULL;
5805 oplist = OP_SIBLING(oplist); /* skip CONST */
5808 oplist = OP_SIBLING(oplist);;
5811 /* apply magic and QR overloading to arg */
5814 if (SvROK(msv) && SvAMAGIC(msv)) {
5815 SV *sv = AMG_CALLunary(msv, regexp_amg);
5819 if (SvTYPE(sv) != SVt_REGEXP)
5820 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5825 /* try concatenation overload ... */
5826 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5827 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5830 /* overloading involved: all bets are off over literal
5831 * code. Pretend we haven't seen it */
5832 pRExC_state->num_code_blocks -= n;
5836 /* ... or failing that, try "" overload */
5837 while (SvAMAGIC(msv)
5838 && (sv = AMG_CALLunary(msv, string_amg))
5842 && SvRV(msv) == SvRV(sv))
5847 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5851 /* this is a partially unrolled
5852 * sv_catsv_nomg(pat, msv);
5853 * that allows us to adjust code block indices if
5856 char *dst = SvPV_force_nomg(pat, dlen);
5858 if (SvUTF8(msv) && !SvUTF8(pat)) {
5859 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5860 sv_setpvn(pat, dst, dlen);
5863 sv_catsv_nomg(pat, msv);
5870 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5873 /* extract any code blocks within any embedded qr//'s */
5874 if (rx && SvTYPE(rx) == SVt_REGEXP
5875 && RX_ENGINE((REGEXP*)rx)->op_comp)
5878 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5879 if (ri->num_code_blocks) {
5881 /* the presence of an embedded qr// with code means
5882 * we should always recompile: the text of the
5883 * qr// may not have changed, but it may be a
5884 * different closure than last time */
5886 Renew(pRExC_state->code_blocks,
5887 pRExC_state->num_code_blocks + ri->num_code_blocks,
5888 struct reg_code_block);
5889 pRExC_state->num_code_blocks += ri->num_code_blocks;
5891 for (i=0; i < ri->num_code_blocks; i++) {
5892 struct reg_code_block *src, *dst;
5893 STRLEN offset = orig_patlen
5894 + ReANY((REGEXP *)rx)->pre_prefix;
5895 assert(n < pRExC_state->num_code_blocks);
5896 src = &ri->code_blocks[i];
5897 dst = &pRExC_state->code_blocks[n];
5898 dst->start = src->start + offset;
5899 dst->end = src->end + offset;
5900 dst->block = src->block;
5901 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5910 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5919 /* see if there are any run-time code blocks in the pattern.
5920 * False positives are allowed */
5923 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5924 char *pat, STRLEN plen)
5929 PERL_UNUSED_CONTEXT;
5931 for (s = 0; s < plen; s++) {
5932 if (n < pRExC_state->num_code_blocks
5933 && s == pRExC_state->code_blocks[n].start)
5935 s = pRExC_state->code_blocks[n].end;
5939 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5941 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5943 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5950 /* Handle run-time code blocks. We will already have compiled any direct
5951 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5952 * copy of it, but with any literal code blocks blanked out and
5953 * appropriate chars escaped; then feed it into
5955 * eval "qr'modified_pattern'"
5959 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5963 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5965 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5966 * and merge them with any code blocks of the original regexp.
5968 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5969 * instead, just save the qr and return FALSE; this tells our caller that
5970 * the original pattern needs upgrading to utf8.
5974 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5975 char *pat, STRLEN plen)
5979 GET_RE_DEBUG_FLAGS_DECL;
5981 if (pRExC_state->runtime_code_qr) {
5982 /* this is the second time we've been called; this should
5983 * only happen if the main pattern got upgraded to utf8
5984 * during compilation; re-use the qr we compiled first time
5985 * round (which should be utf8 too)
5987 qr = pRExC_state->runtime_code_qr;
5988 pRExC_state->runtime_code_qr = NULL;
5989 assert(RExC_utf8 && SvUTF8(qr));
5995 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
5999 /* determine how many extra chars we need for ' and \ escaping */
6000 for (s = 0; s < plen; s++) {
6001 if (pat[s] == '\'' || pat[s] == '\\')
6005 Newx(newpat, newlen, char);
6007 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6009 for (s = 0; s < plen; s++) {
6010 if (n < pRExC_state->num_code_blocks
6011 && s == pRExC_state->code_blocks[n].start)
6013 /* blank out literal code block */
6014 assert(pat[s] == '(');
6015 while (s <= pRExC_state->code_blocks[n].end) {
6023 if (pat[s] == '\'' || pat[s] == '\\')
6028 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6032 PerlIO_printf(Perl_debug_log,
6033 "%sre-parsing pattern for runtime code:%s %s\n",
6034 PL_colors[4],PL_colors[5],newpat);
6037 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6043 PUSHSTACKi(PERLSI_REQUIRE);
6044 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6045 * parsing qr''; normally only q'' does this. It also alters
6047 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6048 SvREFCNT_dec_NN(sv);
6053 SV * const errsv = ERRSV;
6054 if (SvTRUE_NN(errsv))
6056 Safefree(pRExC_state->code_blocks);
6057 /* use croak_sv ? */
6058 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6061 assert(SvROK(qr_ref));
6063 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6064 /* the leaving below frees the tmp qr_ref.
6065 * Give qr a life of its own */
6073 if (!RExC_utf8 && SvUTF8(qr)) {
6074 /* first time through; the pattern got upgraded; save the
6075 * qr for the next time through */
6076 assert(!pRExC_state->runtime_code_qr);
6077 pRExC_state->runtime_code_qr = qr;
6082 /* extract any code blocks within the returned qr// */
6085 /* merge the main (r1) and run-time (r2) code blocks into one */
6087 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6088 struct reg_code_block *new_block, *dst;
6089 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6092 if (!r2->num_code_blocks) /* we guessed wrong */
6094 SvREFCNT_dec_NN(qr);
6099 r1->num_code_blocks + r2->num_code_blocks,
6100 struct reg_code_block);
6103 while ( i1 < r1->num_code_blocks
6104 || i2 < r2->num_code_blocks)
6106 struct reg_code_block *src;
6109 if (i1 == r1->num_code_blocks) {
6110 src = &r2->code_blocks[i2++];
6113 else if (i2 == r2->num_code_blocks)
6114 src = &r1->code_blocks[i1++];
6115 else if ( r1->code_blocks[i1].start
6116 < r2->code_blocks[i2].start)
6118 src = &r1->code_blocks[i1++];
6119 assert(src->end < r2->code_blocks[i2].start);
6122 assert( r1->code_blocks[i1].start
6123 > r2->code_blocks[i2].start);
6124 src = &r2->code_blocks[i2++];
6126 assert(src->end < r1->code_blocks[i1].start);
6129 assert(pat[src->start] == '(');
6130 assert(pat[src->end] == ')');
6131 dst->start = src->start;
6132 dst->end = src->end;
6133 dst->block = src->block;
6134 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6138 r1->num_code_blocks += r2->num_code_blocks;
6139 Safefree(r1->code_blocks);
6140 r1->code_blocks = new_block;
6143 SvREFCNT_dec_NN(qr);
6149 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6150 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6151 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6152 STRLEN longest_length, bool eol, bool meol)
6154 /* This is the common code for setting up the floating and fixed length
6155 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6156 * as to whether succeeded or not */
6161 if (! (longest_length
6162 || (eol /* Can't have SEOL and MULTI */
6163 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6165 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6166 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6171 /* copy the information about the longest from the reg_scan_data
6172 over to the program. */
6173 if (SvUTF8(sv_longest)) {
6174 *rx_utf8 = sv_longest;
6177 *rx_substr = sv_longest;
6180 /* end_shift is how many chars that must be matched that
6181 follow this item. We calculate it ahead of time as once the
6182 lookbehind offset is added in we lose the ability to correctly
6184 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6185 *rx_end_shift = ml - offset
6186 - longest_length + (SvTAIL(sv_longest) != 0)
6189 t = (eol/* Can't have SEOL and MULTI */
6190 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6191 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6197 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6198 * regular expression into internal code.
6199 * The pattern may be passed either as:
6200 * a list of SVs (patternp plus pat_count)
6201 * a list of OPs (expr)
6202 * If both are passed, the SV list is used, but the OP list indicates
6203 * which SVs are actually pre-compiled code blocks
6205 * The SVs in the list have magic and qr overloading applied to them (and
6206 * the list may be modified in-place with replacement SVs in the latter
6209 * If the pattern hasn't changed from old_re, then old_re will be
6212 * eng is the current engine. If that engine has an op_comp method, then
6213 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6214 * do the initial concatenation of arguments and pass on to the external
6217 * If is_bare_re is not null, set it to a boolean indicating whether the
6218 * arg list reduced (after overloading) to a single bare regex which has
6219 * been returned (i.e. /$qr/).
6221 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6223 * pm_flags contains the PMf_* flags, typically based on those from the
6224 * pm_flags field of the related PMOP. Currently we're only interested in
6225 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6227 * We can't allocate space until we know how big the compiled form will be,
6228 * but we can't compile it (and thus know how big it is) until we've got a
6229 * place to put the code. So we cheat: we compile it twice, once with code
6230 * generation turned off and size counting turned on, and once "for real".
6231 * This also means that we don't allocate space until we are sure that the
6232 * thing really will compile successfully, and we never have to move the
6233 * code and thus invalidate pointers into it. (Note that it has to be in
6234 * one piece because free() must be able to free it all.) [NB: not true in perl]
6236 * Beware that the optimization-preparation code in here knows about some
6237 * of the structure of the compiled regexp. [I'll say.]
6241 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6242 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6243 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6247 regexp_internal *ri;
6255 SV *code_blocksv = NULL;
6256 SV** new_patternp = patternp;
6258 /* these are all flags - maybe they should be turned
6259 * into a single int with different bit masks */
6260 I32 sawlookahead = 0;
6265 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6267 bool runtime_code = 0;
6269 RExC_state_t RExC_state;
6270 RExC_state_t * const pRExC_state = &RExC_state;
6271 #ifdef TRIE_STUDY_OPT
6273 RExC_state_t copyRExC_state;
6275 GET_RE_DEBUG_FLAGS_DECL;
6277 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6279 DEBUG_r(if (!PL_colorset) reginitcolors());
6281 #ifndef PERL_IN_XSUB_RE
6282 /* Initialize these here instead of as-needed, as is quick and avoids
6283 * having to test them each time otherwise */
6284 if (! PL_AboveLatin1) {
6285 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6286 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6287 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6288 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6289 PL_HasMultiCharFold =
6290 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6294 pRExC_state->code_blocks = NULL;
6295 pRExC_state->num_code_blocks = 0;
6298 *is_bare_re = FALSE;
6300 if (expr && (expr->op_type == OP_LIST ||
6301 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6302 /* allocate code_blocks if needed */
6306 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6307 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6308 ncode++; /* count of DO blocks */
6310 pRExC_state->num_code_blocks = ncode;
6311 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6316 /* compile-time pattern with just OP_CONSTs and DO blocks */
6321 /* find how many CONSTs there are */
6324 if (expr->op_type == OP_CONST)
6327 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6328 if (o->op_type == OP_CONST)
6332 /* fake up an SV array */
6334 assert(!new_patternp);
6335 Newx(new_patternp, n, SV*);
6336 SAVEFREEPV(new_patternp);
6340 if (expr->op_type == OP_CONST)
6341 new_patternp[n] = cSVOPx_sv(expr);
6343 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6344 if (o->op_type == OP_CONST)
6345 new_patternp[n++] = cSVOPo_sv;
6350 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6351 "Assembling pattern from %d elements%s\n", pat_count,
6352 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6354 /* set expr to the first arg op */
6356 if (pRExC_state->num_code_blocks
6357 && expr->op_type != OP_CONST)
6359 expr = cLISTOPx(expr)->op_first;
6360 assert( expr->op_type == OP_PUSHMARK
6361 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6362 || expr->op_type == OP_PADRANGE);
6363 expr = OP_SIBLING(expr);
6366 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6367 expr, &recompile, NULL);
6369 /* handle bare (possibly after overloading) regex: foo =~ $re */
6374 if (SvTYPE(re) == SVt_REGEXP) {
6378 Safefree(pRExC_state->code_blocks);
6379 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6380 "Precompiled pattern%s\n",
6381 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6387 exp = SvPV_nomg(pat, plen);
6389 if (!eng->op_comp) {
6390 if ((SvUTF8(pat) && IN_BYTES)
6391 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6393 /* make a temporary copy; either to convert to bytes,
6394 * or to avoid repeating get-magic / overloaded stringify */
6395 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6396 (IN_BYTES ? 0 : SvUTF8(pat)));
6398 Safefree(pRExC_state->code_blocks);
6399 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6402 /* ignore the utf8ness if the pattern is 0 length */
6403 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6404 RExC_uni_semantics = 0;
6405 RExC_contains_locale = 0;
6406 RExC_contains_i = 0;
6407 pRExC_state->runtime_code_qr = NULL;
6410 SV *dsv= sv_newmortal();
6411 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6412 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6413 PL_colors[4],PL_colors[5],s);
6417 /* we jump here if we upgrade the pattern to utf8 and have to
6420 if ((pm_flags & PMf_USE_RE_EVAL)
6421 /* this second condition covers the non-regex literal case,
6422 * i.e. $foo =~ '(?{})'. */
6423 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6425 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6427 /* return old regex if pattern hasn't changed */
6428 /* XXX: note in the below we have to check the flags as well as the
6431 * Things get a touch tricky as we have to compare the utf8 flag
6432 * independently from the compile flags. */
6436 && !!RX_UTF8(old_re) == !!RExC_utf8
6437 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6438 && RX_PRECOMP(old_re)
6439 && RX_PRELEN(old_re) == plen
6440 && memEQ(RX_PRECOMP(old_re), exp, plen)
6441 && !runtime_code /* with runtime code, always recompile */ )
6443 Safefree(pRExC_state->code_blocks);
6447 rx_flags = orig_rx_flags;
6449 if (rx_flags & PMf_FOLD) {
6450 RExC_contains_i = 1;
6452 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6454 /* Set to use unicode semantics if the pattern is in utf8 and has the
6455 * 'depends' charset specified, as it means unicode when utf8 */
6456 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6460 RExC_flags = rx_flags;
6461 RExC_pm_flags = pm_flags;
6464 if (TAINTING_get && TAINT_get)
6465 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6467 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6468 /* whoops, we have a non-utf8 pattern, whilst run-time code
6469 * got compiled as utf8. Try again with a utf8 pattern */
6470 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6471 pRExC_state->num_code_blocks);
6472 goto redo_first_pass;
6475 assert(!pRExC_state->runtime_code_qr);
6481 RExC_in_lookbehind = 0;
6482 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6484 RExC_override_recoding = 0;
6485 RExC_in_multi_char_class = 0;
6487 /* First pass: determine size, legality. */
6490 RExC_end = exp + plen;
6495 RExC_emit = (regnode *) &RExC_emit_dummy;
6496 RExC_whilem_seen = 0;
6497 RExC_open_parens = NULL;
6498 RExC_close_parens = NULL;
6500 RExC_paren_names = NULL;
6502 RExC_paren_name_list = NULL;
6504 RExC_recurse = NULL;
6505 RExC_study_chunk_recursed = NULL;
6506 RExC_study_chunk_recursed_bytes= 0;
6507 RExC_recurse_count = 0;
6508 pRExC_state->code_index = 0;
6510 #if 0 /* REGC() is (currently) a NOP at the first pass.
6511 * Clever compilers notice this and complain. --jhi */
6512 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6515 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6517 RExC_lastparse=NULL;
6519 /* reg may croak on us, not giving us a chance to free
6520 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6521 need it to survive as long as the regexp (qr/(?{})/).
6522 We must check that code_blocksv is not already set, because we may
6523 have jumped back to restart the sizing pass. */
6524 if (pRExC_state->code_blocks && !code_blocksv) {
6525 code_blocksv = newSV_type(SVt_PV);
6526 SAVEFREESV(code_blocksv);
6527 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6528 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6530 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6531 /* It's possible to write a regexp in ascii that represents Unicode
6532 codepoints outside of the byte range, such as via \x{100}. If we
6533 detect such a sequence we have to convert the entire pattern to utf8
6534 and then recompile, as our sizing calculation will have been based
6535 on 1 byte == 1 character, but we will need to use utf8 to encode
6536 at least some part of the pattern, and therefore must convert the whole
6539 if (flags & RESTART_UTF8) {
6540 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6541 pRExC_state->num_code_blocks);
6542 goto redo_first_pass;
6544 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6547 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6550 PerlIO_printf(Perl_debug_log,
6551 "Required size %"IVdf" nodes\n"
6552 "Starting second pass (creation)\n",
6555 RExC_lastparse=NULL;
6558 /* The first pass could have found things that force Unicode semantics */
6559 if ((RExC_utf8 || RExC_uni_semantics)
6560 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6562 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6565 /* Small enough for pointer-storage convention?
6566 If extralen==0, this means that we will not need long jumps. */
6567 if (RExC_size >= 0x10000L && RExC_extralen)
6568 RExC_size += RExC_extralen;
6571 if (RExC_whilem_seen > 15)
6572 RExC_whilem_seen = 15;
6574 /* Allocate space and zero-initialize. Note, the two step process
6575 of zeroing when in debug mode, thus anything assigned has to
6576 happen after that */
6577 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6579 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6580 char, regexp_internal);
6581 if ( r == NULL || ri == NULL )
6582 FAIL("Regexp out of space");
6584 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6585 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6588 /* bulk initialize base fields with 0. */
6589 Zero(ri, sizeof(regexp_internal), char);
6592 /* non-zero initialization begins here */
6595 r->extflags = rx_flags;
6596 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6598 if (pm_flags & PMf_IS_QR) {
6599 ri->code_blocks = pRExC_state->code_blocks;
6600 ri->num_code_blocks = pRExC_state->num_code_blocks;
6605 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6606 if (pRExC_state->code_blocks[n].src_regex)
6607 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6608 SAVEFREEPV(pRExC_state->code_blocks);
6612 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6613 bool has_charset = (get_regex_charset(r->extflags)
6614 != REGEX_DEPENDS_CHARSET);
6616 /* The caret is output if there are any defaults: if not all the STD
6617 * flags are set, or if no character set specifier is needed */
6619 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6621 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6622 == REG_RUN_ON_COMMENT_SEEN);
6623 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6624 >> RXf_PMf_STD_PMMOD_SHIFT);
6625 const char *fptr = STD_PAT_MODS; /*"msix"*/
6627 /* Allocate for the worst case, which is all the std flags are turned
6628 * on. If more precision is desired, we could do a population count of
6629 * the flags set. This could be done with a small lookup table, or by
6630 * shifting, masking and adding, or even, when available, assembly
6631 * language for a machine-language population count.
6632 * We never output a minus, as all those are defaults, so are
6633 * covered by the caret */
6634 const STRLEN wraplen = plen + has_p + has_runon
6635 + has_default /* If needs a caret */
6637 /* If needs a character set specifier */
6638 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6639 + (sizeof(STD_PAT_MODS) - 1)
6640 + (sizeof("(?:)") - 1);
6642 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6643 r->xpv_len_u.xpvlenu_pv = p;
6645 SvFLAGS(rx) |= SVf_UTF8;
6648 /* If a default, cover it using the caret */
6650 *p++= DEFAULT_PAT_MOD;
6654 const char* const name = get_regex_charset_name(r->extflags, &len);
6655 Copy(name, p, len, char);
6659 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6662 while((ch = *fptr++)) {
6670 Copy(RExC_precomp, p, plen, char);
6671 assert ((RX_WRAPPED(rx) - p) < 16);
6672 r->pre_prefix = p - RX_WRAPPED(rx);
6678 SvCUR_set(rx, p - RX_WRAPPED(rx));
6682 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6684 /* setup various meta data about recursion, this all requires
6685 * RExC_npar to be correctly set, and a bit later on we clear it */
6686 if (RExC_seen & REG_RECURSE_SEEN) {
6687 Newxz(RExC_open_parens, RExC_npar,regnode *);
6688 SAVEFREEPV(RExC_open_parens);
6689 Newxz(RExC_close_parens,RExC_npar,regnode *);
6690 SAVEFREEPV(RExC_close_parens);
6692 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6693 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6694 * So its 1 if there are no parens. */
6695 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6696 ((RExC_npar & 0x07) != 0);
6697 Newx(RExC_study_chunk_recursed,
6698 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6699 SAVEFREEPV(RExC_study_chunk_recursed);
6702 /* Useful during FAIL. */
6703 #ifdef RE_TRACK_PATTERN_OFFSETS
6704 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6705 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6706 "%s %"UVuf" bytes for offset annotations.\n",
6707 ri->u.offsets ? "Got" : "Couldn't get",
6708 (UV)((2*RExC_size+1) * sizeof(U32))));
6710 SetProgLen(ri,RExC_size);
6715 /* Second pass: emit code. */
6716 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6717 RExC_pm_flags = pm_flags;
6719 RExC_end = exp + plen;
6722 RExC_emit_start = ri->program;
6723 RExC_emit = ri->program;
6724 RExC_emit_bound = ri->program + RExC_size + 1;
6725 pRExC_state->code_index = 0;
6727 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6728 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6730 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6732 /* XXXX To minimize changes to RE engine we always allocate
6733 3-units-long substrs field. */
6734 Newx(r->substrs, 1, struct reg_substr_data);
6735 if (RExC_recurse_count) {
6736 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6737 SAVEFREEPV(RExC_recurse);
6741 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6742 Zero(r->substrs, 1, struct reg_substr_data);
6743 if (RExC_study_chunk_recursed)
6744 Zero(RExC_study_chunk_recursed,
6745 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6747 #ifdef TRIE_STUDY_OPT
6749 StructCopy(&zero_scan_data, &data, scan_data_t);
6750 copyRExC_state = RExC_state;
6753 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6755 RExC_state = copyRExC_state;
6756 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6757 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6759 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6760 StructCopy(&zero_scan_data, &data, scan_data_t);
6763 StructCopy(&zero_scan_data, &data, scan_data_t);
6766 /* Dig out information for optimizations. */
6767 r->extflags = RExC_flags; /* was pm_op */
6768 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6771 SvUTF8_on(rx); /* Unicode in it? */
6772 ri->regstclass = NULL;
6773 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6774 r->intflags |= PREGf_NAUGHTY;
6775 scan = ri->program + 1; /* First BRANCH. */
6777 /* testing for BRANCH here tells us whether there is "must appear"
6778 data in the pattern. If there is then we can use it for optimisations */
6779 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
6782 STRLEN longest_float_length, longest_fixed_length;
6783 regnode_ssc ch_class; /* pointed to by data */
6785 SSize_t last_close = 0; /* pointed to by data */
6786 regnode *first= scan;
6787 regnode *first_next= regnext(first);
6789 * Skip introductions and multiplicators >= 1
6790 * so that we can extract the 'meat' of the pattern that must
6791 * match in the large if() sequence following.
6792 * NOTE that EXACT is NOT covered here, as it is normally
6793 * picked up by the optimiser separately.
6795 * This is unfortunate as the optimiser isnt handling lookahead
6796 * properly currently.
6799 while ((OP(first) == OPEN && (sawopen = 1)) ||
6800 /* An OR of *one* alternative - should not happen now. */
6801 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6802 /* for now we can't handle lookbehind IFMATCH*/
6803 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6804 (OP(first) == PLUS) ||
6805 (OP(first) == MINMOD) ||
6806 /* An {n,m} with n>0 */
6807 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6808 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6811 * the only op that could be a regnode is PLUS, all the rest
6812 * will be regnode_1 or regnode_2.
6814 * (yves doesn't think this is true)
6816 if (OP(first) == PLUS)
6819 if (OP(first) == MINMOD)
6821 first += regarglen[OP(first)];
6823 first = NEXTOPER(first);
6824 first_next= regnext(first);
6827 /* Starting-point info. */
6829 DEBUG_PEEP("first:",first,0);
6830 /* Ignore EXACT as we deal with it later. */
6831 if (PL_regkind[OP(first)] == EXACT) {
6832 if (OP(first) == EXACT)
6833 NOOP; /* Empty, get anchored substr later. */
6835 ri->regstclass = first;
6838 else if (PL_regkind[OP(first)] == TRIE &&
6839 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6841 /* this can happen only on restudy */
6842 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6845 else if (REGNODE_SIMPLE(OP(first)))
6846 ri->regstclass = first;
6847 else if (PL_regkind[OP(first)] == BOUND ||
6848 PL_regkind[OP(first)] == NBOUND)
6849 ri->regstclass = first;
6850 else if (PL_regkind[OP(first)] == BOL) {
6851 r->intflags |= (OP(first) == MBOL
6853 : (OP(first) == SBOL
6856 first = NEXTOPER(first);
6859 else if (OP(first) == GPOS) {
6860 r->intflags |= PREGf_ANCH_GPOS;
6861 first = NEXTOPER(first);
6864 else if ((!sawopen || !RExC_sawback) &&
6866 (OP(first) == STAR &&
6867 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6868 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6870 /* turn .* into ^.* with an implied $*=1 */
6872 (OP(NEXTOPER(first)) == REG_ANY)
6875 r->intflags |= (type | PREGf_IMPLICIT);
6876 first = NEXTOPER(first);
6879 if (sawplus && !sawminmod && !sawlookahead
6880 && (!sawopen || !RExC_sawback)
6881 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6882 /* x+ must match at the 1st pos of run of x's */
6883 r->intflags |= PREGf_SKIP;
6885 /* Scan is after the zeroth branch, first is atomic matcher. */
6886 #ifdef TRIE_STUDY_OPT
6889 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6890 (IV)(first - scan + 1))
6894 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6895 (IV)(first - scan + 1))
6901 * If there's something expensive in the r.e., find the
6902 * longest literal string that must appear and make it the
6903 * regmust. Resolve ties in favor of later strings, since
6904 * the regstart check works with the beginning of the r.e.
6905 * and avoiding duplication strengthens checking. Not a
6906 * strong reason, but sufficient in the absence of others.
6907 * [Now we resolve ties in favor of the earlier string if
6908 * it happens that c_offset_min has been invalidated, since the
6909 * earlier string may buy us something the later one won't.]
6912 data.longest_fixed = newSVpvs("");
6913 data.longest_float = newSVpvs("");
6914 data.last_found = newSVpvs("");
6915 data.longest = &(data.longest_fixed);
6916 ENTER_with_name("study_chunk");
6917 SAVEFREESV(data.longest_fixed);
6918 SAVEFREESV(data.longest_float);
6919 SAVEFREESV(data.last_found);
6921 if (!ri->regstclass) {
6922 ssc_init(pRExC_state, &ch_class);
6923 data.start_class = &ch_class;
6924 stclass_flag = SCF_DO_STCLASS_AND;
6925 } else /* XXXX Check for BOUND? */
6927 data.last_closep = &last_close;
6930 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6931 scan + RExC_size, /* Up to end */
6933 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6934 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6938 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6941 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6942 && data.last_start_min == 0 && data.last_end > 0
6943 && !RExC_seen_zerolen
6944 && !(RExC_seen & REG_VERBARG_SEEN)
6945 && !(RExC_seen & REG_GPOS_SEEN)
6947 r->extflags |= RXf_CHECK_ALL;
6949 scan_commit(pRExC_state, &data,&minlen,0);
6951 longest_float_length = CHR_SVLEN(data.longest_float);
6953 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6954 && data.offset_fixed == data.offset_float_min
6955 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6956 && S_setup_longest (aTHX_ pRExC_state,
6960 &(r->float_end_shift),
6961 data.lookbehind_float,
6962 data.offset_float_min,
6964 longest_float_length,
6965 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6966 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6968 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6969 r->float_max_offset = data.offset_float_max;
6970 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6971 r->float_max_offset -= data.lookbehind_float;
6972 SvREFCNT_inc_simple_void_NN(data.longest_float);
6975 r->float_substr = r->float_utf8 = NULL;
6976 longest_float_length = 0;
6979 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6981 if (S_setup_longest (aTHX_ pRExC_state,
6983 &(r->anchored_utf8),
6984 &(r->anchored_substr),
6985 &(r->anchored_end_shift),
6986 data.lookbehind_fixed,
6989 longest_fixed_length,
6990 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6991 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6993 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6994 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6997 r->anchored_substr = r->anchored_utf8 = NULL;
6998 longest_fixed_length = 0;
7000 LEAVE_with_name("study_chunk");
7003 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7004 ri->regstclass = NULL;
7006 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7008 && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7009 && !ssc_is_anything(data.start_class))
7011 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7013 ssc_finalize(pRExC_state, data.start_class);
7015 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7016 StructCopy(data.start_class,
7017 (regnode_ssc*)RExC_rxi->data->data[n],
7019 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7020 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7021 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7022 regprop(r, sv, (regnode*)data.start_class, NULL);
7023 PerlIO_printf(Perl_debug_log,
7024 "synthetic stclass \"%s\".\n",
7025 SvPVX_const(sv));});
7026 data.start_class = NULL;
7029 /* A temporary algorithm prefers floated substr to fixed one to dig
7031 if (longest_fixed_length > longest_float_length) {
7032 r->substrs->check_ix = 0;
7033 r->check_end_shift = r->anchored_end_shift;
7034 r->check_substr = r->anchored_substr;
7035 r->check_utf8 = r->anchored_utf8;
7036 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7037 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7038 r->intflags |= PREGf_NOSCAN;
7041 r->substrs->check_ix = 1;
7042 r->check_end_shift = r->float_end_shift;
7043 r->check_substr = r->float_substr;
7044 r->check_utf8 = r->float_utf8;
7045 r->check_offset_min = r->float_min_offset;
7046 r->check_offset_max = r->float_max_offset;
7048 if ((r->check_substr || r->check_utf8) ) {
7049 r->extflags |= RXf_USE_INTUIT;
7050 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7051 r->extflags |= RXf_INTUIT_TAIL;
7053 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7055 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7056 if ( (STRLEN)minlen < longest_float_length )
7057 minlen= longest_float_length;
7058 if ( (STRLEN)minlen < longest_fixed_length )
7059 minlen= longest_fixed_length;
7063 /* Several toplevels. Best we can is to set minlen. */
7065 regnode_ssc ch_class;
7066 SSize_t last_close = 0;
7068 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7070 scan = ri->program + 1;
7071 ssc_init(pRExC_state, &ch_class);
7072 data.start_class = &ch_class;
7073 data.last_closep = &last_close;
7076 minlen = study_chunk(pRExC_state,
7077 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7078 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7079 ? SCF_TRIE_DOING_RESTUDY
7083 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7085 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7086 = r->float_substr = r->float_utf8 = NULL;
7088 if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7089 && ! ssc_is_anything(data.start_class))
7091 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7093 ssc_finalize(pRExC_state, data.start_class);
7095 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7096 StructCopy(data.start_class,
7097 (regnode_ssc*)RExC_rxi->data->data[n],
7099 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7100 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7101 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7102 regprop(r, sv, (regnode*)data.start_class, NULL);
7103 PerlIO_printf(Perl_debug_log,
7104 "synthetic stclass \"%s\".\n",
7105 SvPVX_const(sv));});
7106 data.start_class = NULL;
7110 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7111 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7112 r->maxlen = REG_INFTY;
7115 r->maxlen = RExC_maxlen;
7118 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7119 the "real" pattern. */
7121 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7122 (IV)minlen, (IV)r->minlen, RExC_maxlen);
7124 r->minlenret = minlen;
7125 if (r->minlen < minlen)
7128 if (RExC_seen & REG_GPOS_SEEN)
7129 r->intflags |= PREGf_GPOS_SEEN;
7130 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7131 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7133 if (pRExC_state->num_code_blocks)
7134 r->extflags |= RXf_EVAL_SEEN;
7135 if (RExC_seen & REG_CANY_SEEN)
7136 r->intflags |= PREGf_CANY_SEEN;
7137 if (RExC_seen & REG_VERBARG_SEEN)
7139 r->intflags |= PREGf_VERBARG_SEEN;
7140 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7142 if (RExC_seen & REG_CUTGROUP_SEEN)
7143 r->intflags |= PREGf_CUTGROUP_SEEN;
7144 if (pm_flags & PMf_USE_RE_EVAL)
7145 r->intflags |= PREGf_USE_RE_EVAL;
7146 if (RExC_paren_names)
7147 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7149 RXp_PAREN_NAMES(r) = NULL;
7151 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7152 * so it can be used in pp.c */
7153 if (r->intflags & PREGf_ANCH)
7154 r->extflags |= RXf_IS_ANCHORED;
7158 /* this is used to identify "special" patterns that might result
7159 * in Perl NOT calling the regex engine and instead doing the match "itself",
7160 * particularly special cases in split//. By having the regex compiler
7161 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7162 * we avoid weird issues with equivalent patterns resulting in different behavior,
7163 * AND we allow non Perl engines to get the same optimizations by the setting the
7164 * flags appropriately - Yves */
7165 regnode *first = ri->program + 1;
7167 regnode *next = NEXTOPER(first);
7170 if (PL_regkind[fop] == NOTHING && nop == END)
7171 r->extflags |= RXf_NULL;
7172 else if (PL_regkind[fop] == BOL && nop == END)
7173 r->extflags |= RXf_START_ONLY;
7174 else if (fop == PLUS
7175 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7176 && OP(regnext(first)) == END)
7177 r->extflags |= RXf_WHITE;
7178 else if ( r->extflags & RXf_SPLIT
7180 && STR_LEN(first) == 1
7181 && *(STRING(first)) == ' '
7182 && OP(regnext(first)) == END )
7183 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7187 if (RExC_contains_locale) {
7188 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7192 if (RExC_paren_names) {
7193 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7194 ri->data->data[ri->name_list_idx]
7195 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7198 ri->name_list_idx = 0;
7200 if (RExC_recurse_count) {
7201 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7202 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7203 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7206 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7207 /* assume we don't need to swap parens around before we match */
7211 PerlIO_printf(Perl_debug_log,"Final program:\n");
7214 #ifdef RE_TRACK_PATTERN_OFFSETS
7215 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7216 const STRLEN len = ri->u.offsets[0];
7218 GET_RE_DEBUG_FLAGS_DECL;
7219 PerlIO_printf(Perl_debug_log,
7220 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7221 for (i = 1; i <= len; i++) {
7222 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7223 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7224 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7226 PerlIO_printf(Perl_debug_log, "\n");
7231 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7232 * by setting the regexp SV to readonly-only instead. If the
7233 * pattern's been recompiled, the USEDness should remain. */
7234 if (old_re && SvREADONLY(old_re))
7242 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7245 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7247 PERL_UNUSED_ARG(value);
7249 if (flags & RXapif_FETCH) {
7250 return reg_named_buff_fetch(rx, key, flags);
7251 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7252 Perl_croak_no_modify();
7254 } else if (flags & RXapif_EXISTS) {
7255 return reg_named_buff_exists(rx, key, flags)
7258 } else if (flags & RXapif_REGNAMES) {
7259 return reg_named_buff_all(rx, flags);
7260 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7261 return reg_named_buff_scalar(rx, flags);
7263 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7269 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7272 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7273 PERL_UNUSED_ARG(lastkey);
7275 if (flags & RXapif_FIRSTKEY)
7276 return reg_named_buff_firstkey(rx, flags);
7277 else if (flags & RXapif_NEXTKEY)
7278 return reg_named_buff_nextkey(rx, flags);
7280 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7287 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7290 AV *retarray = NULL;
7292 struct regexp *const rx = ReANY(r);
7294 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7296 if (flags & RXapif_ALL)
7299 if (rx && RXp_PAREN_NAMES(rx)) {
7300 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7303 SV* sv_dat=HeVAL(he_str);
7304 I32 *nums=(I32*)SvPVX(sv_dat);
7305 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7306 if ((I32)(rx->nparens) >= nums[i]
7307 && rx->offs[nums[i]].start != -1
7308 && rx->offs[nums[i]].end != -1)
7311 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7316 ret = newSVsv(&PL_sv_undef);
7319 av_push(retarray, ret);
7322 return newRV_noinc(MUTABLE_SV(retarray));
7329 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7332 struct regexp *const rx = ReANY(r);
7334 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7336 if (rx && RXp_PAREN_NAMES(rx)) {
7337 if (flags & RXapif_ALL) {
7338 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7340 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7342 SvREFCNT_dec_NN(sv);
7354 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7356 struct regexp *const rx = ReANY(r);
7358 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7360 if ( rx && RXp_PAREN_NAMES(rx) ) {
7361 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7363 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7370 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7372 struct regexp *const rx = ReANY(r);
7373 GET_RE_DEBUG_FLAGS_DECL;
7375 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7377 if (rx && RXp_PAREN_NAMES(rx)) {
7378 HV *hv = RXp_PAREN_NAMES(rx);
7380 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7383 SV* sv_dat = HeVAL(temphe);
7384 I32 *nums = (I32*)SvPVX(sv_dat);
7385 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7386 if ((I32)(rx->lastparen) >= nums[i] &&
7387 rx->offs[nums[i]].start != -1 &&
7388 rx->offs[nums[i]].end != -1)
7394 if (parno || flags & RXapif_ALL) {
7395 return newSVhek(HeKEY_hek(temphe));
7403 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7408 struct regexp *const rx = ReANY(r);
7410 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7412 if (rx && RXp_PAREN_NAMES(rx)) {
7413 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7414 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7415 } else if (flags & RXapif_ONE) {
7416 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7417 av = MUTABLE_AV(SvRV(ret));
7418 length = av_tindex(av);
7419 SvREFCNT_dec_NN(ret);
7420 return newSViv(length + 1);
7422 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7427 return &PL_sv_undef;
7431 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7433 struct regexp *const rx = ReANY(r);
7436 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7438 if (rx && RXp_PAREN_NAMES(rx)) {
7439 HV *hv= RXp_PAREN_NAMES(rx);
7441 (void)hv_iterinit(hv);
7442 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7445 SV* sv_dat = HeVAL(temphe);
7446 I32 *nums = (I32*)SvPVX(sv_dat);
7447 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7448 if ((I32)(rx->lastparen) >= nums[i] &&
7449 rx->offs[nums[i]].start != -1 &&
7450 rx->offs[nums[i]].end != -1)
7456 if (parno || flags & RXapif_ALL) {
7457 av_push(av, newSVhek(HeKEY_hek(temphe)));
7462 return newRV_noinc(MUTABLE_SV(av));
7466 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7469 struct regexp *const rx = ReANY(r);
7475 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7477 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7478 || n == RX_BUFF_IDX_CARET_FULLMATCH
7479 || n == RX_BUFF_IDX_CARET_POSTMATCH
7482 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7484 /* on something like
7487 * the KEEPCOPY is set on the PMOP rather than the regex */
7488 if (PL_curpm && r == PM_GETRE(PL_curpm))
7489 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7498 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7499 /* no need to distinguish between them any more */
7500 n = RX_BUFF_IDX_FULLMATCH;
7502 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7503 && rx->offs[0].start != -1)
7505 /* $`, ${^PREMATCH} */
7506 i = rx->offs[0].start;
7510 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7511 && rx->offs[0].end != -1)
7513 /* $', ${^POSTMATCH} */
7514 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7515 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7518 if ( 0 <= n && n <= (I32)rx->nparens &&
7519 (s1 = rx->offs[n].start) != -1 &&
7520 (t1 = rx->offs[n].end) != -1)
7522 /* $&, ${^MATCH}, $1 ... */
7524 s = rx->subbeg + s1 - rx->suboffset;
7529 assert(s >= rx->subbeg);
7530 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7532 #ifdef NO_TAINT_SUPPORT
7533 sv_setpvn(sv, s, i);
7535 const int oldtainted = TAINT_get;
7537 sv_setpvn(sv, s, i);
7538 TAINT_set(oldtainted);
7540 if ( (rx->intflags & PREGf_CANY_SEEN)
7541 ? (RXp_MATCH_UTF8(rx)
7542 && (!i || is_utf8_string((U8*)s, i)))
7543 : (RXp_MATCH_UTF8(rx)) )
7550 if (RXp_MATCH_TAINTED(rx)) {
7551 if (SvTYPE(sv) >= SVt_PVMG) {
7552 MAGIC* const mg = SvMAGIC(sv);
7555 SvMAGIC_set(sv, mg->mg_moremagic);
7557 if ((mgt = SvMAGIC(sv))) {
7558 mg->mg_moremagic = mgt;
7559 SvMAGIC_set(sv, mg);
7570 sv_setsv(sv,&PL_sv_undef);
7576 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7577 SV const * const value)
7579 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7581 PERL_UNUSED_ARG(rx);
7582 PERL_UNUSED_ARG(paren);
7583 PERL_UNUSED_ARG(value);
7586 Perl_croak_no_modify();
7590 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7593 struct regexp *const rx = ReANY(r);
7597 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7599 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7600 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7601 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7604 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7606 /* on something like
7609 * the KEEPCOPY is set on the PMOP rather than the regex */
7610 if (PL_curpm && r == PM_GETRE(PL_curpm))
7611 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7617 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7619 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7620 case RX_BUFF_IDX_PREMATCH: /* $` */
7621 if (rx->offs[0].start != -1) {
7622 i = rx->offs[0].start;
7631 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7632 case RX_BUFF_IDX_POSTMATCH: /* $' */
7633 if (rx->offs[0].end != -1) {
7634 i = rx->sublen - rx->offs[0].end;
7636 s1 = rx->offs[0].end;
7643 default: /* $& / ${^MATCH}, $1, $2, ... */
7644 if (paren <= (I32)rx->nparens &&
7645 (s1 = rx->offs[paren].start) != -1 &&
7646 (t1 = rx->offs[paren].end) != -1)
7652 if (ckWARN(WARN_UNINITIALIZED))
7653 report_uninit((const SV *)sv);
7658 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7659 const char * const s = rx->subbeg - rx->suboffset + s1;
7664 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7671 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7673 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7674 PERL_UNUSED_ARG(rx);
7678 return newSVpvs("Regexp");
7681 /* Scans the name of a named buffer from the pattern.
7682 * If flags is REG_RSN_RETURN_NULL returns null.
7683 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7684 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7685 * to the parsed name as looked up in the RExC_paren_names hash.
7686 * If there is an error throws a vFAIL().. type exception.
7689 #define REG_RSN_RETURN_NULL 0
7690 #define REG_RSN_RETURN_NAME 1
7691 #define REG_RSN_RETURN_DATA 2
7694 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7696 char *name_start = RExC_parse;
7698 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7700 assert (RExC_parse <= RExC_end);
7701 if (RExC_parse == RExC_end) NOOP;
7702 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7703 /* skip IDFIRST by using do...while */
7706 RExC_parse += UTF8SKIP(RExC_parse);
7707 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7711 } while (isWORDCHAR(*RExC_parse));
7713 RExC_parse++; /* so the <- from the vFAIL is after the offending
7715 vFAIL("Group name must start with a non-digit word character");
7719 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7720 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7721 if ( flags == REG_RSN_RETURN_NAME)
7723 else if (flags==REG_RSN_RETURN_DATA) {
7726 if ( ! sv_name ) /* should not happen*/
7727 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7728 if (RExC_paren_names)
7729 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7731 sv_dat = HeVAL(he_str);
7733 vFAIL("Reference to nonexistent named group");
7737 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7738 (unsigned long) flags);
7740 assert(0); /* NOT REACHED */
7745 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7746 int rem=(int)(RExC_end - RExC_parse); \
7755 if (RExC_lastparse!=RExC_parse) \
7756 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7759 iscut ? "..." : "<" \
7762 PerlIO_printf(Perl_debug_log,"%16s",""); \
7765 num = RExC_size + 1; \
7767 num=REG_NODE_NUM(RExC_emit); \
7768 if (RExC_lastnum!=num) \
7769 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7771 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7772 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7773 (int)((depth*2)), "", \
7777 RExC_lastparse=RExC_parse; \
7782 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7783 DEBUG_PARSE_MSG((funcname)); \
7784 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7786 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7787 DEBUG_PARSE_MSG((funcname)); \
7788 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7791 /* This section of code defines the inversion list object and its methods. The
7792 * interfaces are highly subject to change, so as much as possible is static to
7793 * this file. An inversion list is here implemented as a malloc'd C UV array
7794 * as an SVt_INVLIST scalar.
7796 * An inversion list for Unicode is an array of code points, sorted by ordinal
7797 * number. The zeroth element is the first code point in the list. The 1th
7798 * element is the first element beyond that not in the list. In other words,
7799 * the first range is
7800 * invlist[0]..(invlist[1]-1)
7801 * The other ranges follow. Thus every element whose index is divisible by two
7802 * marks the beginning of a range that is in the list, and every element not
7803 * divisible by two marks the beginning of a range not in the list. A single
7804 * element inversion list that contains the single code point N generally
7805 * consists of two elements
7808 * (The exception is when N is the highest representable value on the
7809 * machine, in which case the list containing just it would be a single
7810 * element, itself. By extension, if the last range in the list extends to
7811 * infinity, then the first element of that range will be in the inversion list
7812 * at a position that is divisible by two, and is the final element in the
7814 * Taking the complement (inverting) an inversion list is quite simple, if the
7815 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7816 * This implementation reserves an element at the beginning of each inversion
7817 * list to always contain 0; there is an additional flag in the header which
7818 * indicates if the list begins at the 0, or is offset to begin at the next
7821 * More about inversion lists can be found in "Unicode Demystified"
7822 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7823 * More will be coming when functionality is added later.
7825 * The inversion list data structure is currently implemented as an SV pointing
7826 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7827 * array of UV whose memory management is automatically handled by the existing
7828 * facilities for SV's.
7830 * Some of the methods should always be private to the implementation, and some
7831 * should eventually be made public */
7833 /* The header definitions are in F<inline_invlist.c> */
7835 PERL_STATIC_INLINE UV*
7836 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7838 /* Returns a pointer to the first element in the inversion list's array.
7839 * This is called upon initialization of an inversion list. Where the
7840 * array begins depends on whether the list has the code point U+0000 in it
7841 * or not. The other parameter tells it whether the code that follows this
7842 * call is about to put a 0 in the inversion list or not. The first
7843 * element is either the element reserved for 0, if TRUE, or the element
7844 * after it, if FALSE */
7846 bool* offset = get_invlist_offset_addr(invlist);
7847 UV* zero_addr = (UV *) SvPVX(invlist);
7849 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7852 assert(! _invlist_len(invlist));
7856 /* 1^1 = 0; 1^0 = 1 */
7857 *offset = 1 ^ will_have_0;
7858 return zero_addr + *offset;
7861 PERL_STATIC_INLINE UV*
7862 S_invlist_array(SV* const invlist)
7864 /* Returns the pointer to the inversion list's array. Every time the
7865 * length changes, this needs to be called in case malloc or realloc moved
7868 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7870 /* Must not be empty. If these fail, you probably didn't check for <len>
7871 * being non-zero before trying to get the array */
7872 assert(_invlist_len(invlist));
7874 /* The very first element always contains zero, The array begins either
7875 * there, or if the inversion list is offset, at the element after it.
7876 * The offset header field determines which; it contains 0 or 1 to indicate
7877 * how much additionally to add */
7878 assert(0 == *(SvPVX(invlist)));
7879 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7882 PERL_STATIC_INLINE void
7883 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7885 /* Sets the current number of elements stored in the inversion list.
7886 * Updates SvCUR correspondingly */
7887 PERL_UNUSED_CONTEXT;
7888 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7890 assert(SvTYPE(invlist) == SVt_INVLIST);
7895 : TO_INTERNAL_SIZE(len + offset));
7896 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7899 PERL_STATIC_INLINE IV*
7900 S_get_invlist_previous_index_addr(SV* invlist)
7902 /* Return the address of the IV that is reserved to hold the cached index
7904 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7906 assert(SvTYPE(invlist) == SVt_INVLIST);
7908 return &(((XINVLIST*) SvANY(invlist))->prev_index);
7911 PERL_STATIC_INLINE IV
7912 S_invlist_previous_index(SV* const invlist)
7914 /* Returns cached index of previous search */
7916 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7918 return *get_invlist_previous_index_addr(invlist);
7921 PERL_STATIC_INLINE void
7922 S_invlist_set_previous_index(SV* const invlist, const IV index)
7924 /* Caches <index> for later retrieval */
7926 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7928 assert(index == 0 || index < (int) _invlist_len(invlist));
7930 *get_invlist_previous_index_addr(invlist) = index;
7933 PERL_STATIC_INLINE UV
7934 S_invlist_max(SV* const invlist)
7936 /* Returns the maximum number of elements storable in the inversion list's
7937 * array, without having to realloc() */
7939 PERL_ARGS_ASSERT_INVLIST_MAX;
7941 assert(SvTYPE(invlist) == SVt_INVLIST);
7943 /* Assumes worst case, in which the 0 element is not counted in the
7944 * inversion list, so subtracts 1 for that */
7945 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7946 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7947 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7950 #ifndef PERL_IN_XSUB_RE
7952 Perl__new_invlist(pTHX_ IV initial_size)
7955 /* Return a pointer to a newly constructed inversion list, with enough
7956 * space to store 'initial_size' elements. If that number is negative, a
7957 * system default is used instead */
7961 if (initial_size < 0) {
7965 /* Allocate the initial space */
7966 new_list = newSV_type(SVt_INVLIST);
7968 /* First 1 is in case the zero element isn't in the list; second 1 is for
7970 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7971 invlist_set_len(new_list, 0, 0);
7973 /* Force iterinit() to be used to get iteration to work */
7974 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7976 *get_invlist_previous_index_addr(new_list) = 0;
7982 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7984 /* Return a pointer to a newly constructed inversion list, initialized to
7985 * point to <list>, which has to be in the exact correct inversion list
7986 * form, including internal fields. Thus this is a dangerous routine that
7987 * should not be used in the wrong hands. The passed in 'list' contains
7988 * several header fields at the beginning that are not part of the
7989 * inversion list body proper */
7991 const STRLEN length = (STRLEN) list[0];
7992 const UV version_id = list[1];
7993 const bool offset = cBOOL(list[2]);
7994 #define HEADER_LENGTH 3
7995 /* If any of the above changes in any way, you must change HEADER_LENGTH
7996 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7997 * perl -E 'say int(rand 2**31-1)'
7999 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8000 data structure type, so that one being
8001 passed in can be validated to be an
8002 inversion list of the correct vintage.
8005 SV* invlist = newSV_type(SVt_INVLIST);
8007 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8009 if (version_id != INVLIST_VERSION_ID) {
8010 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8013 /* The generated array passed in includes header elements that aren't part
8014 * of the list proper, so start it just after them */
8015 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8017 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8018 shouldn't touch it */
8020 *(get_invlist_offset_addr(invlist)) = offset;
8022 /* The 'length' passed to us is the physical number of elements in the
8023 * inversion list. But if there is an offset the logical number is one
8025 invlist_set_len(invlist, length - offset, offset);
8027 invlist_set_previous_index(invlist, 0);
8029 /* Initialize the iteration pointer. */
8030 invlist_iterfinish(invlist);
8032 SvREADONLY_on(invlist);
8036 #endif /* ifndef PERL_IN_XSUB_RE */
8039 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8041 /* Grow the maximum size of an inversion list */
8043 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8045 assert(SvTYPE(invlist) == SVt_INVLIST);
8047 /* Add one to account for the zero element at the beginning which may not
8048 * be counted by the calling parameters */
8049 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8052 PERL_STATIC_INLINE void
8053 S_invlist_trim(SV* const invlist)
8055 PERL_ARGS_ASSERT_INVLIST_TRIM;
8057 assert(SvTYPE(invlist) == SVt_INVLIST);
8059 /* Change the length of the inversion list to how many entries it currently
8061 SvPV_shrink_to_cur((SV *) invlist);
8065 S__append_range_to_invlist(pTHX_ SV* const invlist,
8066 const UV start, const UV end)
8068 /* Subject to change or removal. Append the range from 'start' to 'end' at
8069 * the end of the inversion list. The range must be above any existing
8073 UV max = invlist_max(invlist);
8074 UV len = _invlist_len(invlist);
8077 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8079 if (len == 0) { /* Empty lists must be initialized */
8080 offset = start != 0;
8081 array = _invlist_array_init(invlist, ! offset);
8084 /* Here, the existing list is non-empty. The current max entry in the
8085 * list is generally the first value not in the set, except when the
8086 * set extends to the end of permissible values, in which case it is
8087 * the first entry in that final set, and so this call is an attempt to
8088 * append out-of-order */
8090 UV final_element = len - 1;
8091 array = invlist_array(invlist);
8092 if (array[final_element] > start
8093 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8095 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8096 array[final_element], start,
8097 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8100 /* Here, it is a legal append. If the new range begins with the first
8101 * value not in the set, it is extending the set, so the new first
8102 * value not in the set is one greater than the newly extended range.
8104 offset = *get_invlist_offset_addr(invlist);
8105 if (array[final_element] == start) {
8106 if (end != UV_MAX) {
8107 array[final_element] = end + 1;
8110 /* But if the end is the maximum representable on the machine,
8111 * just let the range that this would extend to have no end */
8112 invlist_set_len(invlist, len - 1, offset);
8118 /* Here the new range doesn't extend any existing set. Add it */
8120 len += 2; /* Includes an element each for the start and end of range */
8122 /* If wll overflow the existing space, extend, which may cause the array to
8125 invlist_extend(invlist, len);
8127 /* Have to set len here to avoid assert failure in invlist_array() */
8128 invlist_set_len(invlist, len, offset);
8130 array = invlist_array(invlist);
8133 invlist_set_len(invlist, len, offset);
8136 /* The next item on the list starts the range, the one after that is
8137 * one past the new range. */
8138 array[len - 2] = start;
8139 if (end != UV_MAX) {
8140 array[len - 1] = end + 1;
8143 /* But if the end is the maximum representable on the machine, just let
8144 * the range have no end */
8145 invlist_set_len(invlist, len - 1, offset);
8149 #ifndef PERL_IN_XSUB_RE
8152 Perl__invlist_search(SV* const invlist, const UV cp)
8154 /* Searches the inversion list for the entry that contains the input code
8155 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8156 * return value is the index into the list's array of the range that
8161 IV high = _invlist_len(invlist);
8162 const IV highest_element = high - 1;
8165 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8167 /* If list is empty, return failure. */
8172 /* (We can't get the array unless we know the list is non-empty) */
8173 array = invlist_array(invlist);
8175 mid = invlist_previous_index(invlist);
8176 assert(mid >=0 && mid <= highest_element);
8178 /* <mid> contains the cache of the result of the previous call to this
8179 * function (0 the first time). See if this call is for the same result,
8180 * or if it is for mid-1. This is under the theory that calls to this
8181 * function will often be for related code points that are near each other.
8182 * And benchmarks show that caching gives better results. We also test
8183 * here if the code point is within the bounds of the list. These tests
8184 * replace others that would have had to be made anyway to make sure that
8185 * the array bounds were not exceeded, and these give us extra information
8186 * at the same time */
8187 if (cp >= array[mid]) {
8188 if (cp >= array[highest_element]) {
8189 return highest_element;
8192 /* Here, array[mid] <= cp < array[highest_element]. This means that
8193 * the final element is not the answer, so can exclude it; it also
8194 * means that <mid> is not the final element, so can refer to 'mid + 1'
8196 if (cp < array[mid + 1]) {
8202 else { /* cp < aray[mid] */
8203 if (cp < array[0]) { /* Fail if outside the array */
8207 if (cp >= array[mid - 1]) {
8212 /* Binary search. What we are looking for is <i> such that
8213 * array[i] <= cp < array[i+1]
8214 * The loop below converges on the i+1. Note that there may not be an
8215 * (i+1)th element in the array, and things work nonetheless */
8216 while (low < high) {
8217 mid = (low + high) / 2;
8218 assert(mid <= highest_element);
8219 if (array[mid] <= cp) { /* cp >= array[mid] */
8222 /* We could do this extra test to exit the loop early.
8223 if (cp < array[low]) {
8228 else { /* cp < array[mid] */
8235 invlist_set_previous_index(invlist, high);
8240 Perl__invlist_populate_swatch(SV* const invlist,
8241 const UV start, const UV end, U8* swatch)
8243 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8244 * but is used when the swash has an inversion list. This makes this much
8245 * faster, as it uses a binary search instead of a linear one. This is
8246 * intimately tied to that function, and perhaps should be in utf8.c,
8247 * except it is intimately tied to inversion lists as well. It assumes
8248 * that <swatch> is all 0's on input */
8251 const IV len = _invlist_len(invlist);
8255 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8257 if (len == 0) { /* Empty inversion list */
8261 array = invlist_array(invlist);
8263 /* Find which element it is */
8264 i = _invlist_search(invlist, start);
8266 /* We populate from <start> to <end> */
8267 while (current < end) {
8270 /* The inversion list gives the results for every possible code point
8271 * after the first one in the list. Only those ranges whose index is
8272 * even are ones that the inversion list matches. For the odd ones,
8273 * and if the initial code point is not in the list, we have to skip
8274 * forward to the next element */
8275 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8277 if (i >= len) { /* Finished if beyond the end of the array */
8281 if (current >= end) { /* Finished if beyond the end of what we
8283 if (LIKELY(end < UV_MAX)) {
8287 /* We get here when the upper bound is the maximum
8288 * representable on the machine, and we are looking for just
8289 * that code point. Have to special case it */
8291 goto join_end_of_list;
8294 assert(current >= start);
8296 /* The current range ends one below the next one, except don't go past
8299 upper = (i < len && array[i] < end) ? array[i] : end;
8301 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8302 * for each code point in it */
8303 for (; current < upper; current++) {
8304 const STRLEN offset = (STRLEN)(current - start);
8305 swatch[offset >> 3] |= 1 << (offset & 7);
8310 /* Quit if at the end of the list */
8313 /* But first, have to deal with the highest possible code point on
8314 * the platform. The previous code assumes that <end> is one
8315 * beyond where we want to populate, but that is impossible at the
8316 * platform's infinity, so have to handle it specially */
8317 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8319 const STRLEN offset = (STRLEN)(end - start);
8320 swatch[offset >> 3] |= 1 << (offset & 7);
8325 /* Advance to the next range, which will be for code points not in the
8334 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8335 const bool complement_b, SV** output)
8337 /* Take the union of two inversion lists and point <output> to it. *output
8338 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8339 * the reference count to that list will be decremented if not already a
8340 * temporary (mortal); otherwise *output will be made correspondingly
8341 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8342 * second list is returned. If <complement_b> is TRUE, the union is taken
8343 * of the complement (inversion) of <b> instead of b itself.
8345 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8346 * Richard Gillam, published by Addison-Wesley, and explained at some
8347 * length there. The preface says to incorporate its examples into your
8348 * code at your own risk.
8350 * The algorithm is like a merge sort.
8352 * XXX A potential performance improvement is to keep track as we go along
8353 * if only one of the inputs contributes to the result, meaning the other
8354 * is a subset of that one. In that case, we can skip the final copy and
8355 * return the larger of the input lists, but then outside code might need
8356 * to keep track of whether to free the input list or not */
8358 const UV* array_a; /* a's array */
8360 UV len_a; /* length of a's array */
8363 SV* u; /* the resulting union */
8367 UV i_a = 0; /* current index into a's array */
8371 /* running count, as explained in the algorithm source book; items are
8372 * stopped accumulating and are output when the count changes to/from 0.
8373 * The count is incremented when we start a range that's in the set, and
8374 * decremented when we start a range that's not in the set. So its range
8375 * is 0 to 2. Only when the count is zero is something not in the set.
8379 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8382 /* If either one is empty, the union is the other one */
8383 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8384 bool make_temp = FALSE; /* Should we mortalize the result? */
8388 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8394 *output = invlist_clone(b);
8396 _invlist_invert(*output);
8398 } /* else *output already = b; */
8401 sv_2mortal(*output);
8405 else if ((len_b = _invlist_len(b)) == 0) {
8406 bool make_temp = FALSE;
8408 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8413 /* The complement of an empty list is a list that has everything in it,
8414 * so the union with <a> includes everything too */
8417 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8421 *output = _new_invlist(1);
8422 _append_range_to_invlist(*output, 0, UV_MAX);
8424 else if (*output != a) {
8425 *output = invlist_clone(a);
8427 /* else *output already = a; */
8430 sv_2mortal(*output);
8435 /* Here both lists exist and are non-empty */
8436 array_a = invlist_array(a);
8437 array_b = invlist_array(b);
8439 /* If are to take the union of 'a' with the complement of b, set it
8440 * up so are looking at b's complement. */
8443 /* To complement, we invert: if the first element is 0, remove it. To
8444 * do this, we just pretend the array starts one later */
8445 if (array_b[0] == 0) {
8451 /* But if the first element is not zero, we pretend the list starts
8452 * at the 0 that is always stored immediately before the array. */
8458 /* Size the union for the worst case: that the sets are completely
8460 u = _new_invlist(len_a + len_b);
8462 /* Will contain U+0000 if either component does */
8463 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8464 || (len_b > 0 && array_b[0] == 0));
8466 /* Go through each list item by item, stopping when exhausted one of
8468 while (i_a < len_a && i_b < len_b) {
8469 UV cp; /* The element to potentially add to the union's array */
8470 bool cp_in_set; /* is it in the the input list's set or not */
8472 /* We need to take one or the other of the two inputs for the union.
8473 * Since we are merging two sorted lists, we take the smaller of the
8474 * next items. In case of a tie, we take the one that is in its set
8475 * first. If we took one not in the set first, it would decrement the
8476 * count, possibly to 0 which would cause it to be output as ending the
8477 * range, and the next time through we would take the same number, and
8478 * output it again as beginning the next range. By doing it the
8479 * opposite way, there is no possibility that the count will be
8480 * momentarily decremented to 0, and thus the two adjoining ranges will
8481 * be seamlessly merged. (In a tie and both are in the set or both not
8482 * in the set, it doesn't matter which we take first.) */
8483 if (array_a[i_a] < array_b[i_b]
8484 || (array_a[i_a] == array_b[i_b]
8485 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8487 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8491 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8492 cp = array_b[i_b++];
8495 /* Here, have chosen which of the two inputs to look at. Only output
8496 * if the running count changes to/from 0, which marks the
8497 * beginning/end of a range in that's in the set */
8500 array_u[i_u++] = cp;
8507 array_u[i_u++] = cp;
8512 /* Here, we are finished going through at least one of the lists, which
8513 * means there is something remaining in at most one. We check if the list
8514 * that hasn't been exhausted is positioned such that we are in the middle
8515 * of a range in its set or not. (i_a and i_b point to the element beyond
8516 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8517 * is potentially more to output.
8518 * There are four cases:
8519 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8520 * in the union is entirely from the non-exhausted set.
8521 * 2) Both were in their sets, count is 2. Nothing further should
8522 * be output, as everything that remains will be in the exhausted
8523 * list's set, hence in the union; decrementing to 1 but not 0 insures
8525 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8526 * Nothing further should be output because the union includes
8527 * everything from the exhausted set. Not decrementing ensures that.
8528 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8529 * decrementing to 0 insures that we look at the remainder of the
8530 * non-exhausted set */
8531 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8532 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8537 /* The final length is what we've output so far, plus what else is about to
8538 * be output. (If 'count' is non-zero, then the input list we exhausted
8539 * has everything remaining up to the machine's limit in its set, and hence
8540 * in the union, so there will be no further output. */
8543 /* At most one of the subexpressions will be non-zero */
8544 len_u += (len_a - i_a) + (len_b - i_b);
8547 /* Set result to final length, which can change the pointer to array_u, so
8549 if (len_u != _invlist_len(u)) {
8550 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8552 array_u = invlist_array(u);
8555 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8556 * the other) ended with everything above it not in its set. That means
8557 * that the remaining part of the union is precisely the same as the
8558 * non-exhausted list, so can just copy it unchanged. (If both list were
8559 * exhausted at the same time, then the operations below will be both 0.)
8562 IV copy_count; /* At most one will have a non-zero copy count */
8563 if ((copy_count = len_a - i_a) > 0) {
8564 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8566 else if ((copy_count = len_b - i_b) > 0) {
8567 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8571 /* We may be removing a reference to one of the inputs. If so, the output
8572 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8573 * count decremented) */
8574 if (a == *output || b == *output) {
8575 assert(! invlist_is_iterating(*output));
8576 if ((SvTEMP(*output))) {
8580 SvREFCNT_dec_NN(*output);
8590 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8591 const bool complement_b, SV** i)
8593 /* Take the intersection of two inversion lists and point <i> to it. *i
8594 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8595 * the reference count to that list will be decremented if not already a
8596 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8597 * The first list, <a>, may be NULL, in which case an empty list is
8598 * returned. If <complement_b> is TRUE, the result will be the
8599 * intersection of <a> and the complement (or inversion) of <b> instead of
8602 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8603 * Richard Gillam, published by Addison-Wesley, and explained at some
8604 * length there. The preface says to incorporate its examples into your
8605 * code at your own risk. In fact, it had bugs
8607 * The algorithm is like a merge sort, and is essentially the same as the
8611 const UV* array_a; /* a's array */
8613 UV len_a; /* length of a's array */
8616 SV* r; /* the resulting intersection */
8620 UV i_a = 0; /* current index into a's array */
8624 /* running count, as explained in the algorithm source book; items are
8625 * stopped accumulating and are output when the count changes to/from 2.
8626 * The count is incremented when we start a range that's in the set, and
8627 * decremented when we start a range that's not in the set. So its range
8628 * is 0 to 2. Only when the count is 2 is something in the intersection.
8632 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8635 /* Special case if either one is empty */
8636 len_a = (a == NULL) ? 0 : _invlist_len(a);
8637 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8638 bool make_temp = FALSE;
8640 if (len_a != 0 && complement_b) {
8642 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8643 * be empty. Here, also we are using 'b's complement, which hence
8644 * must be every possible code point. Thus the intersection is
8648 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8653 *i = invlist_clone(a);
8655 /* else *i is already 'a' */
8663 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8664 * intersection must be empty */
8666 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8671 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8675 *i = _new_invlist(0);
8683 /* Here both lists exist and are non-empty */
8684 array_a = invlist_array(a);
8685 array_b = invlist_array(b);
8687 /* If are to take the intersection of 'a' with the complement of b, set it
8688 * up so are looking at b's complement. */
8691 /* To complement, we invert: if the first element is 0, remove it. To
8692 * do this, we just pretend the array starts one later */
8693 if (array_b[0] == 0) {
8699 /* But if the first element is not zero, we pretend the list starts
8700 * at the 0 that is always stored immediately before the array. */
8706 /* Size the intersection for the worst case: that the intersection ends up
8707 * fragmenting everything to be completely disjoint */
8708 r= _new_invlist(len_a + len_b);
8710 /* Will contain U+0000 iff both components do */
8711 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8712 && len_b > 0 && array_b[0] == 0);
8714 /* Go through each list item by item, stopping when exhausted one of
8716 while (i_a < len_a && i_b < len_b) {
8717 UV cp; /* The element to potentially add to the intersection's
8719 bool cp_in_set; /* Is it in the input list's set or not */
8721 /* We need to take one or the other of the two inputs for the
8722 * intersection. Since we are merging two sorted lists, we take the
8723 * smaller of the next items. In case of a tie, we take the one that
8724 * is not in its set first (a difference from the union algorithm). If
8725 * we took one in the set first, it would increment the count, possibly
8726 * to 2 which would cause it to be output as starting a range in the
8727 * intersection, and the next time through we would take that same
8728 * number, and output it again as ending the set. By doing it the
8729 * opposite of this, there is no possibility that the count will be
8730 * momentarily incremented to 2. (In a tie and both are in the set or
8731 * both not in the set, it doesn't matter which we take first.) */
8732 if (array_a[i_a] < array_b[i_b]
8733 || (array_a[i_a] == array_b[i_b]
8734 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8736 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8740 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8744 /* Here, have chosen which of the two inputs to look at. Only output
8745 * if the running count changes to/from 2, which marks the
8746 * beginning/end of a range that's in the intersection */
8750 array_r[i_r++] = cp;
8755 array_r[i_r++] = cp;
8761 /* Here, we are finished going through at least one of the lists, which
8762 * means there is something remaining in at most one. We check if the list
8763 * that has been exhausted is positioned such that we are in the middle
8764 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8765 * the ones we care about.) There are four cases:
8766 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8767 * nothing left in the intersection.
8768 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8769 * above 2. What should be output is exactly that which is in the
8770 * non-exhausted set, as everything it has is also in the intersection
8771 * set, and everything it doesn't have can't be in the intersection
8772 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8773 * gets incremented to 2. Like the previous case, the intersection is
8774 * everything that remains in the non-exhausted set.
8775 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8776 * remains 1. And the intersection has nothing more. */
8777 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8778 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8783 /* The final length is what we've output so far plus what else is in the
8784 * intersection. At most one of the subexpressions below will be non-zero
8788 len_r += (len_a - i_a) + (len_b - i_b);
8791 /* Set result to final length, which can change the pointer to array_r, so
8793 if (len_r != _invlist_len(r)) {
8794 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8796 array_r = invlist_array(r);
8799 /* Finish outputting any remaining */
8800 if (count >= 2) { /* At most one will have a non-zero copy count */
8802 if ((copy_count = len_a - i_a) > 0) {
8803 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8805 else if ((copy_count = len_b - i_b) > 0) {
8806 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8810 /* We may be removing a reference to one of the inputs. If so, the output
8811 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8812 * count decremented) */
8813 if (a == *i || b == *i) {
8814 assert(! invlist_is_iterating(*i));
8819 SvREFCNT_dec_NN(*i);
8829 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8831 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8832 * set. A pointer to the inversion list is returned. This may actually be
8833 * a new list, in which case the passed in one has been destroyed. The
8834 * passed in inversion list can be NULL, in which case a new one is created
8835 * with just the one range in it */
8840 if (invlist == NULL) {
8841 invlist = _new_invlist(2);
8845 len = _invlist_len(invlist);
8848 /* If comes after the final entry actually in the list, can just append it
8851 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8852 && start >= invlist_array(invlist)[len - 1]))
8854 _append_range_to_invlist(invlist, start, end);
8858 /* Here, can't just append things, create and return a new inversion list
8859 * which is the union of this range and the existing inversion list */
8860 range_invlist = _new_invlist(2);
8861 _append_range_to_invlist(range_invlist, start, end);
8863 _invlist_union(invlist, range_invlist, &invlist);
8865 /* The temporary can be freed */
8866 SvREFCNT_dec_NN(range_invlist);
8872 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8873 UV** other_elements_ptr)
8875 /* Create and return an inversion list whose contents are to be populated
8876 * by the caller. The caller gives the number of elements (in 'size') and
8877 * the very first element ('element0'). This function will set
8878 * '*other_elements_ptr' to an array of UVs, where the remaining elements
8881 * Obviously there is some trust involved that the caller will properly
8882 * fill in the other elements of the array.
8884 * (The first element needs to be passed in, as the underlying code does
8885 * things differently depending on whether it is zero or non-zero) */
8887 SV* invlist = _new_invlist(size);
8890 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8892 _append_range_to_invlist(invlist, element0, element0);
8893 offset = *get_invlist_offset_addr(invlist);
8895 invlist_set_len(invlist, size, offset);
8896 *other_elements_ptr = invlist_array(invlist) + 1;
8902 PERL_STATIC_INLINE SV*
8903 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8904 return _add_range_to_invlist(invlist, cp, cp);
8907 #ifndef PERL_IN_XSUB_RE
8909 Perl__invlist_invert(pTHX_ SV* const invlist)
8911 /* Complement the input inversion list. This adds a 0 if the list didn't
8912 * have a zero; removes it otherwise. As described above, the data
8913 * structure is set up so that this is very efficient */
8915 PERL_ARGS_ASSERT__INVLIST_INVERT;
8917 assert(! invlist_is_iterating(invlist));
8919 /* The inverse of matching nothing is matching everything */
8920 if (_invlist_len(invlist) == 0) {
8921 _append_range_to_invlist(invlist, 0, UV_MAX);
8925 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8930 PERL_STATIC_INLINE SV*
8931 S_invlist_clone(pTHX_ SV* const invlist)
8934 /* Return a new inversion list that is a copy of the input one, which is
8935 * unchanged. The new list will not be mortal even if the old one was. */
8937 /* Need to allocate extra space to accommodate Perl's addition of a
8938 * trailing NUL to SvPV's, since it thinks they are always strings */
8939 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8940 STRLEN physical_length = SvCUR(invlist);
8941 bool offset = *(get_invlist_offset_addr(invlist));
8943 PERL_ARGS_ASSERT_INVLIST_CLONE;
8945 *(get_invlist_offset_addr(new_invlist)) = offset;
8946 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8947 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8952 PERL_STATIC_INLINE STRLEN*
8953 S_get_invlist_iter_addr(SV* invlist)
8955 /* Return the address of the UV that contains the current iteration
8958 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8960 assert(SvTYPE(invlist) == SVt_INVLIST);
8962 return &(((XINVLIST*) SvANY(invlist))->iterator);
8965 PERL_STATIC_INLINE void
8966 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
8968 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8970 *get_invlist_iter_addr(invlist) = 0;
8973 PERL_STATIC_INLINE void
8974 S_invlist_iterfinish(SV* invlist)
8976 /* Terminate iterator for invlist. This is to catch development errors.
8977 * Any iteration that is interrupted before completed should call this
8978 * function. Functions that add code points anywhere else but to the end
8979 * of an inversion list assert that they are not in the middle of an
8980 * iteration. If they were, the addition would make the iteration
8981 * problematical: if the iteration hadn't reached the place where things
8982 * were being added, it would be ok */
8984 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8986 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8990 S_invlist_iternext(SV* invlist, UV* start, UV* end)
8992 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8993 * This call sets in <*start> and <*end>, the next range in <invlist>.
8994 * Returns <TRUE> if successful and the next call will return the next
8995 * range; <FALSE> if was already at the end of the list. If the latter,
8996 * <*start> and <*end> are unchanged, and the next call to this function
8997 * will start over at the beginning of the list */
8999 STRLEN* pos = get_invlist_iter_addr(invlist);
9000 UV len = _invlist_len(invlist);
9003 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9006 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9010 array = invlist_array(invlist);
9012 *start = array[(*pos)++];
9018 *end = array[(*pos)++] - 1;
9024 PERL_STATIC_INLINE bool
9025 S_invlist_is_iterating(SV* const invlist)
9027 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9029 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9032 PERL_STATIC_INLINE UV
9033 S_invlist_highest(SV* const invlist)
9035 /* Returns the highest code point that matches an inversion list. This API
9036 * has an ambiguity, as it returns 0 under either the highest is actually
9037 * 0, or if the list is empty. If this distinction matters to you, check
9038 * for emptiness before calling this function */
9040 UV len = _invlist_len(invlist);
9043 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9049 array = invlist_array(invlist);
9051 /* The last element in the array in the inversion list always starts a
9052 * range that goes to infinity. That range may be for code points that are
9053 * matched in the inversion list, or it may be for ones that aren't
9054 * matched. In the latter case, the highest code point in the set is one
9055 * less than the beginning of this range; otherwise it is the final element
9056 * of this range: infinity */
9057 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9059 : array[len - 1] - 1;
9062 #ifndef PERL_IN_XSUB_RE
9064 Perl__invlist_contents(pTHX_ SV* const invlist)
9066 /* Get the contents of an inversion list into a string SV so that they can
9067 * be printed out. It uses the format traditionally done for debug tracing
9071 SV* output = newSVpvs("\n");
9073 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9075 assert(! invlist_is_iterating(invlist));
9077 invlist_iterinit(invlist);
9078 while (invlist_iternext(invlist, &start, &end)) {
9079 if (end == UV_MAX) {
9080 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9082 else if (end != start) {
9083 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9087 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9095 #ifndef PERL_IN_XSUB_RE
9097 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9098 const char * const indent, SV* const invlist)
9100 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9101 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9102 * the string 'indent'. The output looks like this:
9103 [0] 0x000A .. 0x000D
9105 [4] 0x2028 .. 0x2029
9106 [6] 0x3104 .. INFINITY
9107 * This means that the first range of code points matched by the list are
9108 * 0xA through 0xD; the second range contains only the single code point
9109 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9110 * are used to define each range (except if the final range extends to
9111 * infinity, only a single element is needed). The array index of the
9112 * first element for the corresponding range is given in brackets. */
9117 PERL_ARGS_ASSERT__INVLIST_DUMP;
9119 if (invlist_is_iterating(invlist)) {
9120 Perl_dump_indent(aTHX_ level, file,
9121 "%sCan't dump inversion list because is in middle of iterating\n",
9126 invlist_iterinit(invlist);
9127 while (invlist_iternext(invlist, &start, &end)) {
9128 if (end == UV_MAX) {
9129 Perl_dump_indent(aTHX_ level, file,
9130 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9131 indent, (UV)count, start);
9133 else if (end != start) {
9134 Perl_dump_indent(aTHX_ level, file,
9135 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9136 indent, (UV)count, start, end);
9139 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9140 indent, (UV)count, start);
9147 Perl__load_PL_utf8_foldclosures (pTHX)
9149 assert(! PL_utf8_foldclosures);
9151 /* If the folds haven't been read in, call a fold function
9153 if (! PL_utf8_tofold) {
9154 U8 dummy[UTF8_MAXBYTES_CASE+1];
9156 /* This string is just a short named one above \xff */
9157 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9158 assert(PL_utf8_tofold); /* Verify that worked */
9160 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9164 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9166 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9168 /* Return a boolean as to if the two passed in inversion lists are
9169 * identical. The final argument, if TRUE, says to take the complement of
9170 * the second inversion list before doing the comparison */
9172 const UV* array_a = invlist_array(a);
9173 const UV* array_b = invlist_array(b);
9174 UV len_a = _invlist_len(a);
9175 UV len_b = _invlist_len(b);
9177 UV i = 0; /* current index into the arrays */
9178 bool retval = TRUE; /* Assume are identical until proven otherwise */
9180 PERL_ARGS_ASSERT__INVLISTEQ;
9182 /* If are to compare 'a' with the complement of b, set it
9183 * up so are looking at b's complement. */
9186 /* The complement of nothing is everything, so <a> would have to have
9187 * just one element, starting at zero (ending at infinity) */
9189 return (len_a == 1 && array_a[0] == 0);
9191 else if (array_b[0] == 0) {
9193 /* Otherwise, to complement, we invert. Here, the first element is
9194 * 0, just remove it. To do this, we just pretend the array starts
9202 /* But if the first element is not zero, we pretend the list starts
9203 * at the 0 that is always stored immediately before the array. */
9209 /* Make sure that the lengths are the same, as well as the final element
9210 * before looping through the remainder. (Thus we test the length, final,
9211 * and first elements right off the bat) */
9212 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9215 else for (i = 0; i < len_a - 1; i++) {
9216 if (array_a[i] != array_b[i]) {
9226 #undef HEADER_LENGTH
9227 #undef TO_INTERNAL_SIZE
9228 #undef FROM_INTERNAL_SIZE
9229 #undef INVLIST_VERSION_ID
9231 /* End of inversion list object */
9234 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9236 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9237 * constructs, and updates RExC_flags with them. On input, RExC_parse
9238 * should point to the first flag; it is updated on output to point to the
9239 * final ')' or ':'. There needs to be at least one flag, or this will
9242 /* for (?g), (?gc), and (?o) warnings; warning
9243 about (?c) will warn about (?g) -- japhy */
9245 #define WASTED_O 0x01
9246 #define WASTED_G 0x02
9247 #define WASTED_C 0x04
9248 #define WASTED_GC (WASTED_G|WASTED_C)
9249 I32 wastedflags = 0x00;
9250 U32 posflags = 0, negflags = 0;
9251 U32 *flagsp = &posflags;
9252 char has_charset_modifier = '\0';
9254 bool has_use_defaults = FALSE;
9255 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9257 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9259 /* '^' as an initial flag sets certain defaults */
9260 if (UCHARAT(RExC_parse) == '^') {
9262 has_use_defaults = TRUE;
9263 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9264 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9265 ? REGEX_UNICODE_CHARSET
9266 : REGEX_DEPENDS_CHARSET);
9269 cs = get_regex_charset(RExC_flags);
9270 if (cs == REGEX_DEPENDS_CHARSET
9271 && (RExC_utf8 || RExC_uni_semantics))
9273 cs = REGEX_UNICODE_CHARSET;
9276 while (*RExC_parse) {
9277 /* && strchr("iogcmsx", *RExC_parse) */
9278 /* (?g), (?gc) and (?o) are useless here
9279 and must be globally applied -- japhy */
9280 switch (*RExC_parse) {
9282 /* Code for the imsx flags */
9283 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9285 case LOCALE_PAT_MOD:
9286 if (has_charset_modifier) {
9287 goto excess_modifier;
9289 else if (flagsp == &negflags) {
9292 cs = REGEX_LOCALE_CHARSET;
9293 has_charset_modifier = LOCALE_PAT_MOD;
9295 case UNICODE_PAT_MOD:
9296 if (has_charset_modifier) {
9297 goto excess_modifier;
9299 else if (flagsp == &negflags) {
9302 cs = REGEX_UNICODE_CHARSET;
9303 has_charset_modifier = UNICODE_PAT_MOD;
9305 case ASCII_RESTRICT_PAT_MOD:
9306 if (flagsp == &negflags) {
9309 if (has_charset_modifier) {
9310 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9311 goto excess_modifier;
9313 /* Doubled modifier implies more restricted */
9314 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9317 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9319 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9321 case DEPENDS_PAT_MOD:
9322 if (has_use_defaults) {
9323 goto fail_modifiers;
9325 else if (flagsp == &negflags) {
9328 else if (has_charset_modifier) {
9329 goto excess_modifier;
9332 /* The dual charset means unicode semantics if the
9333 * pattern (or target, not known until runtime) are
9334 * utf8, or something in the pattern indicates unicode
9336 cs = (RExC_utf8 || RExC_uni_semantics)
9337 ? REGEX_UNICODE_CHARSET
9338 : REGEX_DEPENDS_CHARSET;
9339 has_charset_modifier = DEPENDS_PAT_MOD;
9343 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9344 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9346 else if (has_charset_modifier == *(RExC_parse - 1)) {
9347 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9351 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9356 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9359 case ONCE_PAT_MOD: /* 'o' */
9360 case GLOBAL_PAT_MOD: /* 'g' */
9361 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9362 const I32 wflagbit = *RExC_parse == 'o'
9365 if (! (wastedflags & wflagbit) ) {
9366 wastedflags |= wflagbit;
9367 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9370 "Useless (%s%c) - %suse /%c modifier",
9371 flagsp == &negflags ? "?-" : "?",
9373 flagsp == &negflags ? "don't " : "",
9380 case CONTINUE_PAT_MOD: /* 'c' */
9381 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9382 if (! (wastedflags & WASTED_C) ) {
9383 wastedflags |= WASTED_GC;
9384 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9387 "Useless (%sc) - %suse /gc modifier",
9388 flagsp == &negflags ? "?-" : "?",
9389 flagsp == &negflags ? "don't " : ""
9394 case KEEPCOPY_PAT_MOD: /* 'p' */
9395 if (flagsp == &negflags) {
9397 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9399 *flagsp |= RXf_PMf_KEEPCOPY;
9403 /* A flag is a default iff it is following a minus, so
9404 * if there is a minus, it means will be trying to
9405 * re-specify a default which is an error */
9406 if (has_use_defaults || flagsp == &negflags) {
9407 goto fail_modifiers;
9410 wastedflags = 0; /* reset so (?g-c) warns twice */
9414 RExC_flags |= posflags;
9415 RExC_flags &= ~negflags;
9416 set_regex_charset(&RExC_flags, cs);
9417 if (RExC_flags & RXf_PMf_FOLD) {
9418 RExC_contains_i = 1;
9424 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9425 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9426 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9427 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9436 - reg - regular expression, i.e. main body or parenthesized thing
9438 * Caller must absorb opening parenthesis.
9440 * Combining parenthesis handling with the base level of regular expression
9441 * is a trifle forced, but the need to tie the tails of the branches to what
9442 * follows makes it hard to avoid.
9444 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9446 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9448 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9451 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9452 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9453 needs to be restarted.
9454 Otherwise would only return NULL if regbranch() returns NULL, which
9457 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9458 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9459 * 2 is like 1, but indicates that nextchar() has been called to advance
9460 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9461 * this flag alerts us to the need to check for that */
9463 regnode *ret; /* Will be the head of the group. */
9466 regnode *ender = NULL;
9469 U32 oregflags = RExC_flags;
9470 bool have_branch = 0;
9472 I32 freeze_paren = 0;
9473 I32 after_freeze = 0;
9474 I32 num; /* numeric backreferences */
9476 char * parse_start = RExC_parse; /* MJD */
9477 char * const oregcomp_parse = RExC_parse;
9479 GET_RE_DEBUG_FLAGS_DECL;
9481 PERL_ARGS_ASSERT_REG;
9482 DEBUG_PARSE("reg ");
9484 *flagp = 0; /* Tentatively. */
9487 /* Make an OPEN node, if parenthesized. */
9490 /* Under /x, space and comments can be gobbled up between the '(' and
9491 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9492 * intervening space, as the sequence is a token, and a token should be
9494 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9496 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9497 char *start_verb = RExC_parse;
9498 STRLEN verb_len = 0;
9499 char *start_arg = NULL;
9500 unsigned char op = 0;
9502 int internal_argval = 0; /* internal_argval is only useful if
9505 if (has_intervening_patws) {
9507 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9509 while ( *RExC_parse && *RExC_parse != ')' ) {
9510 if ( *RExC_parse == ':' ) {
9511 start_arg = RExC_parse + 1;
9517 verb_len = RExC_parse - start_verb;
9520 while ( *RExC_parse && *RExC_parse != ')' )
9522 if ( *RExC_parse != ')' )
9523 vFAIL("Unterminated verb pattern argument");
9524 if ( RExC_parse == start_arg )
9527 if ( *RExC_parse != ')' )
9528 vFAIL("Unterminated verb pattern");
9531 switch ( *start_verb ) {
9532 case 'A': /* (*ACCEPT) */
9533 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9535 internal_argval = RExC_nestroot;
9538 case 'C': /* (*COMMIT) */
9539 if ( memEQs(start_verb,verb_len,"COMMIT") )
9542 case 'F': /* (*FAIL) */
9543 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9548 case ':': /* (*:NAME) */
9549 case 'M': /* (*MARK:NAME) */
9550 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9555 case 'P': /* (*PRUNE) */
9556 if ( memEQs(start_verb,verb_len,"PRUNE") )
9559 case 'S': /* (*SKIP) */
9560 if ( memEQs(start_verb,verb_len,"SKIP") )
9563 case 'T': /* (*THEN) */
9564 /* [19:06] <TimToady> :: is then */
9565 if ( memEQs(start_verb,verb_len,"THEN") ) {
9567 RExC_seen |= REG_CUTGROUP_SEEN;
9572 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9574 "Unknown verb pattern '%"UTF8f"'",
9575 UTF8fARG(UTF, verb_len, start_verb));
9578 if ( start_arg && internal_argval ) {
9579 vFAIL3("Verb pattern '%.*s' may not have an argument",
9580 verb_len, start_verb);
9581 } else if ( argok < 0 && !start_arg ) {
9582 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9583 verb_len, start_verb);
9585 ret = reganode(pRExC_state, op, internal_argval);
9586 if ( ! internal_argval && ! SIZE_ONLY ) {
9588 SV *sv = newSVpvn( start_arg,
9589 RExC_parse - start_arg);
9590 ARG(ret) = add_data( pRExC_state,
9592 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9599 if (!internal_argval)
9600 RExC_seen |= REG_VERBARG_SEEN;
9601 } else if ( start_arg ) {
9602 vFAIL3("Verb pattern '%.*s' may not have an argument",
9603 verb_len, start_verb);
9605 ret = reg_node(pRExC_state, op);
9607 nextchar(pRExC_state);
9610 else if (*RExC_parse == '?') { /* (?...) */
9611 bool is_logical = 0;
9612 const char * const seqstart = RExC_parse;
9613 const char * endptr;
9614 if (has_intervening_patws) {
9616 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9620 paren = *RExC_parse++;
9621 ret = NULL; /* For look-ahead/behind. */
9624 case 'P': /* (?P...) variants for those used to PCRE/Python */
9625 paren = *RExC_parse++;
9626 if ( paren == '<') /* (?P<...>) named capture */
9628 else if (paren == '>') { /* (?P>name) named recursion */
9629 goto named_recursion;
9631 else if (paren == '=') { /* (?P=...) named backref */
9632 /* this pretty much dupes the code for \k<NAME> in
9633 * regatom(), if you change this make sure you change that
9635 char* name_start = RExC_parse;
9637 SV *sv_dat = reg_scan_name(pRExC_state,
9638 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9639 if (RExC_parse == name_start || *RExC_parse != ')')
9640 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9641 vFAIL2("Sequence %.3s... not terminated",parse_start);
9644 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9645 RExC_rxi->data->data[num]=(void*)sv_dat;
9646 SvREFCNT_inc_simple_void(sv_dat);
9649 ret = reganode(pRExC_state,
9652 : (ASCII_FOLD_RESTRICTED)
9654 : (AT_LEAST_UNI_SEMANTICS)
9662 Set_Node_Offset(ret, parse_start+1);
9663 Set_Node_Cur_Length(ret, parse_start);
9665 nextchar(pRExC_state);
9669 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9670 vFAIL3("Sequence (%.*s...) not recognized",
9671 RExC_parse-seqstart, seqstart);
9673 case '<': /* (?<...) */
9674 if (*RExC_parse == '!')
9676 else if (*RExC_parse != '=')
9682 case '\'': /* (?'...') */
9683 name_start= RExC_parse;
9684 svname = reg_scan_name(pRExC_state,
9685 SIZE_ONLY /* reverse test from the others */
9686 ? REG_RSN_RETURN_NAME
9687 : REG_RSN_RETURN_NULL);
9688 if (RExC_parse == name_start || *RExC_parse != paren)
9689 vFAIL2("Sequence (?%c... not terminated",
9690 paren=='>' ? '<' : paren);
9694 if (!svname) /* shouldn't happen */
9696 "panic: reg_scan_name returned NULL");
9697 if (!RExC_paren_names) {
9698 RExC_paren_names= newHV();
9699 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9701 RExC_paren_name_list= newAV();
9702 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9705 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9707 sv_dat = HeVAL(he_str);
9709 /* croak baby croak */
9711 "panic: paren_name hash element allocation failed");
9712 } else if ( SvPOK(sv_dat) ) {
9713 /* (?|...) can mean we have dupes so scan to check
9714 its already been stored. Maybe a flag indicating
9715 we are inside such a construct would be useful,
9716 but the arrays are likely to be quite small, so
9717 for now we punt -- dmq */
9718 IV count = SvIV(sv_dat);
9719 I32 *pv = (I32*)SvPVX(sv_dat);
9721 for ( i = 0 ; i < count ; i++ ) {
9722 if ( pv[i] == RExC_npar ) {
9728 pv = (I32*)SvGROW(sv_dat,
9729 SvCUR(sv_dat) + sizeof(I32)+1);
9730 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9731 pv[count] = RExC_npar;
9732 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9735 (void)SvUPGRADE(sv_dat,SVt_PVNV);
9736 sv_setpvn(sv_dat, (char *)&(RExC_npar),
9739 SvIV_set(sv_dat, 1);
9742 /* Yes this does cause a memory leak in debugging Perls
9744 if (!av_store(RExC_paren_name_list,
9745 RExC_npar, SvREFCNT_inc(svname)))
9746 SvREFCNT_dec_NN(svname);
9749 /*sv_dump(sv_dat);*/
9751 nextchar(pRExC_state);
9753 goto capturing_parens;
9755 RExC_seen |= REG_LOOKBEHIND_SEEN;
9756 RExC_in_lookbehind++;
9759 case '=': /* (?=...) */
9760 RExC_seen_zerolen++;
9762 case '!': /* (?!...) */
9763 RExC_seen_zerolen++;
9764 if (*RExC_parse == ')') {
9765 ret=reg_node(pRExC_state, OPFAIL);
9766 nextchar(pRExC_state);
9770 case '|': /* (?|...) */
9771 /* branch reset, behave like a (?:...) except that
9772 buffers in alternations share the same numbers */
9774 after_freeze = freeze_paren = RExC_npar;
9776 case ':': /* (?:...) */
9777 case '>': /* (?>...) */
9779 case '$': /* (?$...) */
9780 case '@': /* (?@...) */
9781 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9783 case '0' : /* (?0) */
9784 case 'R' : /* (?R) */
9785 if (*RExC_parse != ')')
9786 FAIL("Sequence (?R) not terminated");
9787 ret = reg_node(pRExC_state, GOSTART);
9788 RExC_seen |= REG_GOSTART_SEEN;
9789 *flagp |= POSTPONED;
9790 nextchar(pRExC_state);
9793 /* named and numeric backreferences */
9794 case '&': /* (?&NAME) */
9795 parse_start = RExC_parse - 1;
9798 SV *sv_dat = reg_scan_name(pRExC_state,
9799 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9800 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9802 if (RExC_parse == RExC_end || *RExC_parse != ')')
9803 vFAIL("Sequence (?&... not terminated");
9804 goto gen_recurse_regop;
9805 assert(0); /* NOT REACHED */
9807 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9809 vFAIL("Illegal pattern");
9811 goto parse_recursion;
9813 case '-': /* (?-1) */
9814 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9815 RExC_parse--; /* rewind to let it be handled later */
9819 case '1': case '2': case '3': case '4': /* (?1) */
9820 case '5': case '6': case '7': case '8': case '9':
9824 bool is_neg = FALSE;
9825 parse_start = RExC_parse - 1; /* MJD */
9826 if (*RExC_parse == '-') {
9830 num = grok_atou(RExC_parse, &endptr);
9832 RExC_parse = (char*)endptr;
9834 /* Some limit for num? */
9838 if (*RExC_parse!=')')
9839 vFAIL("Expecting close bracket");
9842 if ( paren == '-' ) {
9844 Diagram of capture buffer numbering.
9845 Top line is the normal capture buffer numbers
9846 Bottom line is the negative indexing as from
9850 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9854 num = RExC_npar + num;
9857 vFAIL("Reference to nonexistent group");
9859 } else if ( paren == '+' ) {
9860 num = RExC_npar + num - 1;
9863 ret = reganode(pRExC_state, GOSUB, num);
9865 if (num > (I32)RExC_rx->nparens) {
9867 vFAIL("Reference to nonexistent group");
9869 ARG2L_SET( ret, RExC_recurse_count++);
9871 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9872 "Recurse #%"UVuf" to %"IVdf"\n",
9873 (UV)ARG(ret), (IV)ARG2L(ret)));
9877 RExC_seen |= REG_RECURSE_SEEN;
9878 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9879 Set_Node_Offset(ret, parse_start); /* MJD */
9881 *flagp |= POSTPONED;
9882 nextchar(pRExC_state);
9885 assert(0); /* NOT REACHED */
9887 case '?': /* (??...) */
9889 if (*RExC_parse != '{') {
9891 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9893 "Sequence (%"UTF8f"...) not recognized",
9894 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9897 *flagp |= POSTPONED;
9898 paren = *RExC_parse++;
9900 case '{': /* (?{...}) */
9903 struct reg_code_block *cb;
9905 RExC_seen_zerolen++;
9907 if ( !pRExC_state->num_code_blocks
9908 || pRExC_state->code_index >= pRExC_state->num_code_blocks
9909 || pRExC_state->code_blocks[pRExC_state->code_index].start
9910 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9913 if (RExC_pm_flags & PMf_USE_RE_EVAL)
9914 FAIL("panic: Sequence (?{...}): no code block found\n");
9915 FAIL("Eval-group not allowed at runtime, use re 'eval'");
9917 /* this is a pre-compiled code block (?{...}) */
9918 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9919 RExC_parse = RExC_start + cb->end;
9922 if (cb->src_regex) {
9923 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9924 RExC_rxi->data->data[n] =
9925 (void*)SvREFCNT_inc((SV*)cb->src_regex);
9926 RExC_rxi->data->data[n+1] = (void*)o;
9929 n = add_data(pRExC_state,
9930 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9931 RExC_rxi->data->data[n] = (void*)o;
9934 pRExC_state->code_index++;
9935 nextchar(pRExC_state);
9939 ret = reg_node(pRExC_state, LOGICAL);
9940 eval = reganode(pRExC_state, EVAL, n);
9943 /* for later propagation into (??{}) return value */
9944 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9946 REGTAIL(pRExC_state, ret, eval);
9947 /* deal with the length of this later - MJD */
9950 ret = reganode(pRExC_state, EVAL, n);
9951 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9952 Set_Node_Offset(ret, parse_start);
9955 case '(': /* (?(?{...})...) and (?(?=...)...) */
9958 if (RExC_parse[0] == '?') { /* (?(?...)) */
9959 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9960 || RExC_parse[1] == '<'
9961 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9965 ret = reg_node(pRExC_state, LOGICAL);
9969 tail = reg(pRExC_state, 1, &flag, depth+1);
9970 if (flag & RESTART_UTF8) {
9971 *flagp = RESTART_UTF8;
9974 REGTAIL(pRExC_state, ret, tail);
9977 /* Fall through to ‘Unknown switch condition’ at the
9978 end of the if/else chain. */
9980 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9981 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9983 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9984 char *name_start= RExC_parse++;
9986 SV *sv_dat=reg_scan_name(pRExC_state,
9987 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9988 if (RExC_parse == name_start || *RExC_parse != ch)
9989 vFAIL2("Sequence (?(%c... not terminated",
9990 (ch == '>' ? '<' : ch));
9993 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9994 RExC_rxi->data->data[num]=(void*)sv_dat;
9995 SvREFCNT_inc_simple_void(sv_dat);
9997 ret = reganode(pRExC_state,NGROUPP,num);
9998 goto insert_if_check_paren;
10000 else if (RExC_parse[0] == 'D' &&
10001 RExC_parse[1] == 'E' &&
10002 RExC_parse[2] == 'F' &&
10003 RExC_parse[3] == 'I' &&
10004 RExC_parse[4] == 'N' &&
10005 RExC_parse[5] == 'E')
10007 ret = reganode(pRExC_state,DEFINEP,0);
10010 goto insert_if_check_paren;
10012 else if (RExC_parse[0] == 'R') {
10015 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10016 parno = grok_atou(RExC_parse, &endptr);
10018 RExC_parse = (char*)endptr;
10019 } else if (RExC_parse[0] == '&') {
10022 sv_dat = reg_scan_name(pRExC_state,
10024 ? REG_RSN_RETURN_NULL
10025 : REG_RSN_RETURN_DATA);
10026 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10028 ret = reganode(pRExC_state,INSUBP,parno);
10029 goto insert_if_check_paren;
10031 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10035 parno = grok_atou(RExC_parse, &endptr);
10037 RExC_parse = (char*)endptr;
10038 ret = reganode(pRExC_state, GROUPP, parno);
10040 insert_if_check_paren:
10041 if (*(tmp = nextchar(pRExC_state)) != ')') {
10042 /* nextchar also skips comments, so undo its work
10043 * and skip over the the next character.
10046 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10047 vFAIL("Switch condition not recognized");
10050 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10051 br = regbranch(pRExC_state, &flags, 1,depth+1);
10053 if (flags & RESTART_UTF8) {
10054 *flagp = RESTART_UTF8;
10057 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10060 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10062 c = *nextchar(pRExC_state);
10063 if (flags&HASWIDTH)
10064 *flagp |= HASWIDTH;
10067 vFAIL("(?(DEFINE)....) does not allow branches");
10069 /* Fake one for optimizer. */
10070 lastbr = reganode(pRExC_state, IFTHEN, 0);
10072 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10073 if (flags & RESTART_UTF8) {
10074 *flagp = RESTART_UTF8;
10077 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10080 REGTAIL(pRExC_state, ret, lastbr);
10081 if (flags&HASWIDTH)
10082 *flagp |= HASWIDTH;
10083 c = *nextchar(pRExC_state);
10088 vFAIL("Switch (?(condition)... contains too many branches");
10089 ender = reg_node(pRExC_state, TAIL);
10090 REGTAIL(pRExC_state, br, ender);
10092 REGTAIL(pRExC_state, lastbr, ender);
10093 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10096 REGTAIL(pRExC_state, ret, ender);
10097 RExC_size++; /* XXX WHY do we need this?!!
10098 For large programs it seems to be required
10099 but I can't figure out why. -- dmq*/
10102 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10103 vFAIL("Unknown switch condition (?(...))");
10105 case '[': /* (?[ ... ]) */
10106 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10109 RExC_parse--; /* for vFAIL to print correctly */
10110 vFAIL("Sequence (? incomplete");
10112 default: /* e.g., (?i) */
10115 parse_lparen_question_flags(pRExC_state);
10116 if (UCHARAT(RExC_parse) != ':') {
10117 nextchar(pRExC_state);
10122 nextchar(pRExC_state);
10132 ret = reganode(pRExC_state, OPEN, parno);
10134 if (!RExC_nestroot)
10135 RExC_nestroot = parno;
10136 if (RExC_seen & REG_RECURSE_SEEN
10137 && !RExC_open_parens[parno-1])
10139 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10140 "Setting open paren #%"IVdf" to %d\n",
10141 (IV)parno, REG_NODE_NUM(ret)));
10142 RExC_open_parens[parno-1]= ret;
10145 Set_Node_Length(ret, 1); /* MJD */
10146 Set_Node_Offset(ret, RExC_parse); /* MJD */
10154 /* Pick up the branches, linking them together. */
10155 parse_start = RExC_parse; /* MJD */
10156 br = regbranch(pRExC_state, &flags, 1,depth+1);
10158 /* branch_len = (paren != 0); */
10161 if (flags & RESTART_UTF8) {
10162 *flagp = RESTART_UTF8;
10165 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10167 if (*RExC_parse == '|') {
10168 if (!SIZE_ONLY && RExC_extralen) {
10169 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10172 reginsert(pRExC_state, BRANCH, br, depth+1);
10173 Set_Node_Length(br, paren != 0);
10174 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10178 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10180 else if (paren == ':') {
10181 *flagp |= flags&SIMPLE;
10183 if (is_open) { /* Starts with OPEN. */
10184 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10186 else if (paren != '?') /* Not Conditional */
10188 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10190 while (*RExC_parse == '|') {
10191 if (!SIZE_ONLY && RExC_extralen) {
10192 ender = reganode(pRExC_state, LONGJMP,0);
10194 /* Append to the previous. */
10195 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10198 RExC_extralen += 2; /* Account for LONGJMP. */
10199 nextchar(pRExC_state);
10200 if (freeze_paren) {
10201 if (RExC_npar > after_freeze)
10202 after_freeze = RExC_npar;
10203 RExC_npar = freeze_paren;
10205 br = regbranch(pRExC_state, &flags, 0, depth+1);
10208 if (flags & RESTART_UTF8) {
10209 *flagp = RESTART_UTF8;
10212 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10214 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10216 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10219 if (have_branch || paren != ':') {
10220 /* Make a closing node, and hook it on the end. */
10223 ender = reg_node(pRExC_state, TAIL);
10226 ender = reganode(pRExC_state, CLOSE, parno);
10227 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10228 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10229 "Setting close paren #%"IVdf" to %d\n",
10230 (IV)parno, REG_NODE_NUM(ender)));
10231 RExC_close_parens[parno-1]= ender;
10232 if (RExC_nestroot == parno)
10235 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10236 Set_Node_Length(ender,1); /* MJD */
10242 *flagp &= ~HASWIDTH;
10245 ender = reg_node(pRExC_state, SUCCEED);
10248 ender = reg_node(pRExC_state, END);
10250 assert(!RExC_opend); /* there can only be one! */
10251 RExC_opend = ender;
10255 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10256 SV * const mysv_val1=sv_newmortal();
10257 SV * const mysv_val2=sv_newmortal();
10258 DEBUG_PARSE_MSG("lsbr");
10259 regprop(RExC_rx, mysv_val1, lastbr, NULL);
10260 regprop(RExC_rx, mysv_val2, ender, NULL);
10261 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10262 SvPV_nolen_const(mysv_val1),
10263 (IV)REG_NODE_NUM(lastbr),
10264 SvPV_nolen_const(mysv_val2),
10265 (IV)REG_NODE_NUM(ender),
10266 (IV)(ender - lastbr)
10269 REGTAIL(pRExC_state, lastbr, ender);
10271 if (have_branch && !SIZE_ONLY) {
10272 char is_nothing= 1;
10274 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10276 /* Hook the tails of the branches to the closing node. */
10277 for (br = ret; br; br = regnext(br)) {
10278 const U8 op = PL_regkind[OP(br)];
10279 if (op == BRANCH) {
10280 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10281 if ( OP(NEXTOPER(br)) != NOTHING
10282 || regnext(NEXTOPER(br)) != ender)
10285 else if (op == BRANCHJ) {
10286 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10287 /* for now we always disable this optimisation * /
10288 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10289 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10295 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10296 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10297 SV * const mysv_val1=sv_newmortal();
10298 SV * const mysv_val2=sv_newmortal();
10299 DEBUG_PARSE_MSG("NADA");
10300 regprop(RExC_rx, mysv_val1, ret, NULL);
10301 regprop(RExC_rx, mysv_val2, ender, NULL);
10302 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10303 SvPV_nolen_const(mysv_val1),
10304 (IV)REG_NODE_NUM(ret),
10305 SvPV_nolen_const(mysv_val2),
10306 (IV)REG_NODE_NUM(ender),
10311 if (OP(ender) == TAIL) {
10316 for ( opt= br + 1; opt < ender ; opt++ )
10317 OP(opt)= OPTIMIZED;
10318 NEXT_OFF(br)= ender - br;
10326 static const char parens[] = "=!<,>";
10328 if (paren && (p = strchr(parens, paren))) {
10329 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10330 int flag = (p - parens) > 1;
10333 node = SUSPEND, flag = 0;
10334 reginsert(pRExC_state, node,ret, depth+1);
10335 Set_Node_Cur_Length(ret, parse_start);
10336 Set_Node_Offset(ret, parse_start + 1);
10338 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10342 /* Check for proper termination. */
10344 /* restore original flags, but keep (?p) */
10345 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10346 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10347 RExC_parse = oregcomp_parse;
10348 vFAIL("Unmatched (");
10351 else if (!paren && RExC_parse < RExC_end) {
10352 if (*RExC_parse == ')') {
10354 vFAIL("Unmatched )");
10357 FAIL("Junk on end of regexp"); /* "Can't happen". */
10358 assert(0); /* NOTREACHED */
10361 if (RExC_in_lookbehind) {
10362 RExC_in_lookbehind--;
10364 if (after_freeze > RExC_npar)
10365 RExC_npar = after_freeze;
10370 - regbranch - one alternative of an | operator
10372 * Implements the concatenation operator.
10374 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10378 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10381 regnode *chain = NULL;
10383 I32 flags = 0, c = 0;
10384 GET_RE_DEBUG_FLAGS_DECL;
10386 PERL_ARGS_ASSERT_REGBRANCH;
10388 DEBUG_PARSE("brnc");
10393 if (!SIZE_ONLY && RExC_extralen)
10394 ret = reganode(pRExC_state, BRANCHJ,0);
10396 ret = reg_node(pRExC_state, BRANCH);
10397 Set_Node_Length(ret, 1);
10401 if (!first && SIZE_ONLY)
10402 RExC_extralen += 1; /* BRANCHJ */
10404 *flagp = WORST; /* Tentatively. */
10407 nextchar(pRExC_state);
10408 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10409 flags &= ~TRYAGAIN;
10410 latest = regpiece(pRExC_state, &flags,depth+1);
10411 if (latest == NULL) {
10412 if (flags & TRYAGAIN)
10414 if (flags & RESTART_UTF8) {
10415 *flagp = RESTART_UTF8;
10418 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10420 else if (ret == NULL)
10422 *flagp |= flags&(HASWIDTH|POSTPONED);
10423 if (chain == NULL) /* First piece. */
10424 *flagp |= flags&SPSTART;
10427 REGTAIL(pRExC_state, chain, latest);
10432 if (chain == NULL) { /* Loop ran zero times. */
10433 chain = reg_node(pRExC_state, NOTHING);
10438 *flagp |= flags&SIMPLE;
10445 - regpiece - something followed by possible [*+?]
10447 * Note that the branching code sequences used for ? and the general cases
10448 * of * and + are somewhat optimized: they use the same NOTHING node as
10449 * both the endmarker for their branch list and the body of the last branch.
10450 * It might seem that this node could be dispensed with entirely, but the
10451 * endmarker role is not redundant.
10453 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10455 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10459 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10465 const char * const origparse = RExC_parse;
10467 I32 max = REG_INFTY;
10468 #ifdef RE_TRACK_PATTERN_OFFSETS
10471 const char *maxpos = NULL;
10473 /* Save the original in case we change the emitted regop to a FAIL. */
10474 regnode * const orig_emit = RExC_emit;
10476 GET_RE_DEBUG_FLAGS_DECL;
10478 PERL_ARGS_ASSERT_REGPIECE;
10480 DEBUG_PARSE("piec");
10482 ret = regatom(pRExC_state, &flags,depth+1);
10484 if (flags & (TRYAGAIN|RESTART_UTF8))
10485 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10487 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10493 if (op == '{' && regcurly(RExC_parse)) {
10495 #ifdef RE_TRACK_PATTERN_OFFSETS
10496 parse_start = RExC_parse; /* MJD */
10498 next = RExC_parse + 1;
10499 while (isDIGIT(*next) || *next == ',') {
10500 if (*next == ',') {
10508 if (*next == '}') { /* got one */
10509 const char* endptr;
10513 min = grok_atou(RExC_parse, &endptr);
10514 if (*maxpos == ',')
10517 maxpos = RExC_parse;
10518 max = grok_atou(maxpos, &endptr);
10519 if (!max && *maxpos != '0')
10520 max = REG_INFTY; /* meaning "infinity" */
10521 else if (max >= REG_INFTY)
10522 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10524 nextchar(pRExC_state);
10525 if (max < min) { /* If can't match, warn and optimize to fail
10528 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10530 /* We can't back off the size because we have to reserve
10531 * enough space for all the things we are about to throw
10532 * away, but we can shrink it by the ammount we are about
10533 * to re-use here */
10534 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10537 RExC_emit = orig_emit;
10539 ret = reg_node(pRExC_state, OPFAIL);
10542 else if (min == max
10543 && RExC_parse < RExC_end
10544 && (*RExC_parse == '?' || *RExC_parse == '+'))
10547 ckWARN2reg(RExC_parse + 1,
10548 "Useless use of greediness modifier '%c'",
10551 /* Absorb the modifier, so later code doesn't see nor use
10553 nextchar(pRExC_state);
10557 if ((flags&SIMPLE)) {
10558 RExC_naughty += 2 + RExC_naughty / 2;
10559 reginsert(pRExC_state, CURLY, ret, depth+1);
10560 Set_Node_Offset(ret, parse_start+1); /* MJD */
10561 Set_Node_Cur_Length(ret, parse_start);
10564 regnode * const w = reg_node(pRExC_state, WHILEM);
10567 REGTAIL(pRExC_state, ret, w);
10568 if (!SIZE_ONLY && RExC_extralen) {
10569 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10570 reginsert(pRExC_state, NOTHING,ret, depth+1);
10571 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10573 reginsert(pRExC_state, CURLYX,ret, depth+1);
10575 Set_Node_Offset(ret, parse_start+1);
10576 Set_Node_Length(ret,
10577 op == '{' ? (RExC_parse - parse_start) : 1);
10579 if (!SIZE_ONLY && RExC_extralen)
10580 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10581 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10583 RExC_whilem_seen++, RExC_extralen += 3;
10584 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10591 *flagp |= HASWIDTH;
10593 ARG1_SET(ret, (U16)min);
10594 ARG2_SET(ret, (U16)max);
10596 if (max == REG_INFTY)
10597 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10603 if (!ISMULT1(op)) {
10608 #if 0 /* Now runtime fix should be reliable. */
10610 /* if this is reinstated, don't forget to put this back into perldiag:
10612 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10614 (F) The part of the regexp subject to either the * or + quantifier
10615 could match an empty string. The {#} shows in the regular
10616 expression about where the problem was discovered.
10620 if (!(flags&HASWIDTH) && op != '?')
10621 vFAIL("Regexp *+ operand could be empty");
10624 #ifdef RE_TRACK_PATTERN_OFFSETS
10625 parse_start = RExC_parse;
10627 nextchar(pRExC_state);
10629 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10631 if (op == '*' && (flags&SIMPLE)) {
10632 reginsert(pRExC_state, STAR, ret, depth+1);
10635 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10637 else if (op == '*') {
10641 else if (op == '+' && (flags&SIMPLE)) {
10642 reginsert(pRExC_state, PLUS, ret, depth+1);
10645 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10647 else if (op == '+') {
10651 else if (op == '?') {
10656 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10657 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10658 ckWARN2reg(RExC_parse,
10659 "%"UTF8f" matches null string many times",
10660 UTF8fARG(UTF, (RExC_parse >= origparse
10661 ? RExC_parse - origparse
10664 (void)ReREFCNT_inc(RExC_rx_sv);
10667 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10668 nextchar(pRExC_state);
10669 reginsert(pRExC_state, MINMOD, ret, depth+1);
10670 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10673 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10675 nextchar(pRExC_state);
10676 ender = reg_node(pRExC_state, SUCCEED);
10677 REGTAIL(pRExC_state, ret, ender);
10678 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10680 ender = reg_node(pRExC_state, TAIL);
10681 REGTAIL(pRExC_state, ret, ender);
10684 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10686 vFAIL("Nested quantifiers");
10693 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10694 UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10695 const bool strict /* Apply stricter parsing rules? */
10699 /* This is expected to be called by a parser routine that has recognized '\N'
10700 and needs to handle the rest. RExC_parse is expected to point at the first
10701 char following the N at the time of the call. On successful return,
10702 RExC_parse has been updated to point to just after the sequence identified
10703 by this routine, and <*flagp> has been updated.
10705 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10708 \N may begin either a named sequence, or if outside a character class, mean
10709 to match a non-newline. For non single-quoted regexes, the tokenizer has
10710 attempted to decide which, and in the case of a named sequence, converted it
10711 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10712 where c1... are the characters in the sequence. For single-quoted regexes,
10713 the tokenizer passes the \N sequence through unchanged; this code will not
10714 attempt to determine this nor expand those, instead raising a syntax error.
10715 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10716 or there is no '}', it signals that this \N occurrence means to match a
10719 Only the \N{U+...} form should occur in a character class, for the same
10720 reason that '.' inside a character class means to just match a period: it
10721 just doesn't make sense.
10723 The function raises an error (via vFAIL), and doesn't return for various
10724 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
10725 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10726 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10727 only possible if node_p is non-NULL.
10730 If <valuep> is non-null, it means the caller can accept an input sequence
10731 consisting of a just a single code point; <*valuep> is set to that value
10732 if the input is such.
10734 If <node_p> is non-null it signifies that the caller can accept any other
10735 legal sequence (i.e., one that isn't just a single code point). <*node_p>
10737 1) \N means not-a-NL: points to a newly created REG_ANY node;
10738 2) \N{}: points to a new NOTHING node;
10739 3) otherwise: points to a new EXACT node containing the resolved
10741 Note that FALSE is returned for single code point sequences if <valuep> is
10745 char * endbrace; /* '}' following the name */
10747 char *endchar; /* Points to '.' or '}' ending cur char in the input
10749 bool has_multiple_chars; /* true if the input stream contains a sequence of
10750 more than one character */
10752 GET_RE_DEBUG_FLAGS_DECL;
10754 PERL_ARGS_ASSERT_GROK_BSLASH_N;
10756 GET_RE_DEBUG_FLAGS;
10758 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
10760 /* The [^\n] meaning of \N ignores spaces and comments under the /x
10761 * modifier. The other meaning does not, so use a temporary until we find
10762 * out which we are being called with */
10763 p = (RExC_flags & RXf_PMf_EXTENDED)
10764 ? regpatws(pRExC_state, RExC_parse,
10765 TRUE) /* means recognize comments */
10768 /* Disambiguate between \N meaning a named character versus \N meaning
10769 * [^\n]. The former is assumed when it can't be the latter. */
10770 if (*p != '{' || regcurly(p)) {
10773 /* no bare \N allowed in a charclass */
10774 if (in_char_class) {
10775 vFAIL("\\N in a character class must be a named character: \\N{...}");
10779 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
10781 nextchar(pRExC_state);
10782 *node_p = reg_node(pRExC_state, REG_ANY);
10783 *flagp |= HASWIDTH|SIMPLE;
10785 Set_Node_Length(*node_p, 1); /* MJD */
10789 /* Here, we have decided it should be a named character or sequence */
10791 /* The test above made sure that the next real character is a '{', but
10792 * under the /x modifier, it could be separated by space (or a comment and
10793 * \n) and this is not allowed (for consistency with \x{...} and the
10794 * tokenizer handling of \N{NAME}). */
10795 if (*RExC_parse != '{') {
10796 vFAIL("Missing braces on \\N{}");
10799 RExC_parse++; /* Skip past the '{' */
10801 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10802 || ! (endbrace == RExC_parse /* nothing between the {} */
10803 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
10805 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10808 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10809 vFAIL("\\N{NAME} must be resolved by the lexer");
10812 if (endbrace == RExC_parse) { /* empty: \N{} */
10815 *node_p = reg_node(pRExC_state,NOTHING);
10817 else if (in_char_class) {
10818 if (SIZE_ONLY && in_char_class) {
10820 RExC_parse++; /* Position after the "}" */
10821 vFAIL("Zero length \\N{}");
10824 ckWARNreg(RExC_parse,
10825 "Ignoring zero length \\N{} in character class");
10833 nextchar(pRExC_state);
10837 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10838 RExC_parse += 2; /* Skip past the 'U+' */
10840 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10842 /* Code points are separated by dots. If none, there is only one code
10843 * point, and is terminated by the brace */
10844 has_multiple_chars = (endchar < endbrace);
10846 if (valuep && (! has_multiple_chars || in_char_class)) {
10847 /* We only pay attention to the first char of
10848 multichar strings being returned in char classes. I kinda wonder
10849 if this makes sense as it does change the behaviour
10850 from earlier versions, OTOH that behaviour was broken
10851 as well. XXX Solution is to recharacterize as
10852 [rest-of-class]|multi1|multi2... */
10854 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10855 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10856 | PERL_SCAN_DISALLOW_PREFIX
10857 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10859 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10861 /* The tokenizer should have guaranteed validity, but it's possible to
10862 * bypass it by using single quoting, so check */
10863 if (length_of_hex == 0
10864 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10866 RExC_parse += length_of_hex; /* Includes all the valid */
10867 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10868 ? UTF8SKIP(RExC_parse)
10870 /* Guard against malformed utf8 */
10871 if (RExC_parse >= endchar) {
10872 RExC_parse = endchar;
10874 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10877 if (in_char_class && has_multiple_chars) {
10879 RExC_parse = endbrace;
10880 vFAIL("\\N{} in character class restricted to one character");
10883 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10887 RExC_parse = endbrace + 1;
10889 else if (! node_p || ! has_multiple_chars) {
10891 /* Here, the input is legal, but not according to the caller's
10892 * options. We fail without advancing the parse, so that the
10893 * caller can try again */
10899 /* What is done here is to convert this to a sub-pattern of the form
10900 * (?:\x{char1}\x{char2}...)
10901 * and then call reg recursively. That way, it retains its atomicness,
10902 * while not having to worry about special handling that some code
10903 * points may have. toke.c has converted the original Unicode values
10904 * to native, so that we can just pass on the hex values unchanged. We
10905 * do have to set a flag to keep recoding from happening in the
10908 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10910 char *orig_end = RExC_end;
10913 while (RExC_parse < endbrace) {
10915 /* Convert to notation the rest of the code understands */
10916 sv_catpv(substitute_parse, "\\x{");
10917 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10918 sv_catpv(substitute_parse, "}");
10920 /* Point to the beginning of the next character in the sequence. */
10921 RExC_parse = endchar + 1;
10922 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10924 sv_catpv(substitute_parse, ")");
10926 RExC_parse = SvPV(substitute_parse, len);
10928 /* Don't allow empty number */
10930 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10932 RExC_end = RExC_parse + len;
10934 /* The values are Unicode, and therefore not subject to recoding */
10935 RExC_override_recoding = 1;
10937 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10938 if (flags & RESTART_UTF8) {
10939 *flagp = RESTART_UTF8;
10942 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10945 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10947 RExC_parse = endbrace;
10948 RExC_end = orig_end;
10949 RExC_override_recoding = 0;
10951 nextchar(pRExC_state);
10961 * It returns the code point in utf8 for the value in *encp.
10962 * value: a code value in the source encoding
10963 * encp: a pointer to an Encode object
10965 * If the result from Encode is not a single character,
10966 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10969 S_reg_recode(pTHX_ const char value, SV **encp)
10972 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10973 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10974 const STRLEN newlen = SvCUR(sv);
10975 UV uv = UNICODE_REPLACEMENT;
10977 PERL_ARGS_ASSERT_REG_RECODE;
10981 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10984 if (!newlen || numlen != newlen) {
10985 uv = UNICODE_REPLACEMENT;
10991 PERL_STATIC_INLINE U8
10992 S_compute_EXACTish(RExC_state_t *pRExC_state)
10996 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11002 op = get_regex_charset(RExC_flags);
11003 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11004 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11005 been, so there is no hole */
11008 return op + EXACTF;
11011 PERL_STATIC_INLINE void
11012 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11013 regnode *node, I32* flagp, STRLEN len, UV code_point,
11016 /* This knows the details about sizing an EXACTish node, setting flags for
11017 * it (by setting <*flagp>, and potentially populating it with a single
11020 * If <len> (the length in bytes) is non-zero, this function assumes that
11021 * the node has already been populated, and just does the sizing. In this
11022 * case <code_point> should be the final code point that has already been
11023 * placed into the node. This value will be ignored except that under some
11024 * circumstances <*flagp> is set based on it.
11026 * If <len> is zero, the function assumes that the node is to contain only
11027 * the single character given by <code_point> and calculates what <len>
11028 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11029 * additionally will populate the node's STRING with <code_point> or its
11032 * In both cases <*flagp> is appropriately set
11034 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11035 * 255, must be folded (the former only when the rules indicate it can
11038 * When it does the populating, it looks at the flag 'downgradable'. If
11039 * true with a node that folds, it checks if the single code point
11040 * participates in a fold, and if not downgrades the node to an EXACT.
11041 * This helps the optimizer */
11043 bool len_passed_in = cBOOL(len != 0);
11044 U8 character[UTF8_MAXBYTES_CASE+1];
11046 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11048 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11049 * sizing difference, and is extra work that is thrown away */
11050 if (downgradable && ! PASS2) {
11051 downgradable = FALSE;
11054 if (! len_passed_in) {
11056 if (UNI_IS_INVARIANT(code_point)) {
11057 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11058 *character = (U8) code_point;
11060 else { /* Here is /i and not /l (toFOLD() is defined on just
11061 ASCII, which isn't the same thing as INVARIANT on
11062 EBCDIC, but it works there, as the extra invariants
11063 fold to themselves) */
11064 *character = toFOLD((U8) code_point);
11066 && *character == code_point
11067 && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
11074 else if (FOLD && (! LOC
11075 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11076 { /* Folding, and ok to do so now */
11077 UV folded = _to_uni_fold_flags(
11081 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11082 ? FOLD_FLAGS_NOMIX_ASCII
11085 && folded == code_point
11086 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11091 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11093 /* Not folding this cp, and can output it directly */
11094 *character = UTF8_TWO_BYTE_HI(code_point);
11095 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11099 uvchr_to_utf8( character, code_point);
11100 len = UTF8SKIP(character);
11102 } /* Else pattern isn't UTF8. */
11104 *character = (U8) code_point;
11106 } /* Else is folded non-UTF8 */
11107 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11109 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11110 * comments at join_exact()); */
11111 *character = (U8) code_point;
11114 /* Can turn into an EXACT node if we know the fold at compile time,
11115 * and it folds to itself and doesn't particpate in other folds */
11118 && PL_fold_latin1[code_point] == code_point
11119 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11120 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11124 } /* else is Sharp s. May need to fold it */
11125 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11127 *(character + 1) = 's';
11131 *character = LATIN_SMALL_LETTER_SHARP_S;
11137 RExC_size += STR_SZ(len);
11140 RExC_emit += STR_SZ(len);
11141 STR_LEN(node) = len;
11142 if (! len_passed_in) {
11143 Copy((char *) character, STRING(node), len, char);
11147 *flagp |= HASWIDTH;
11149 /* A single character node is SIMPLE, except for the special-cased SHARP S
11151 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11152 && (code_point != LATIN_SMALL_LETTER_SHARP_S
11153 || ! FOLD || ! DEPENDS_SEMANTICS))
11158 /* The OP may not be well defined in PASS1 */
11159 if (PASS2 && OP(node) == EXACTFL) {
11160 RExC_contains_locale = 1;
11165 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11166 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11169 S_backref_value(char *p)
11171 const char* endptr;
11172 UV val = grok_atou(p, &endptr);
11173 if (endptr == p || endptr == NULL || val > I32_MAX)
11180 - regatom - the lowest level
11182 Try to identify anything special at the start of the pattern. If there
11183 is, then handle it as required. This may involve generating a single regop,
11184 such as for an assertion; or it may involve recursing, such as to
11185 handle a () structure.
11187 If the string doesn't start with something special then we gobble up
11188 as much literal text as we can.
11190 Once we have been able to handle whatever type of thing started the
11191 sequence, we return.
11193 Note: we have to be careful with escapes, as they can be both literal
11194 and special, and in the case of \10 and friends, context determines which.
11196 A summary of the code structure is:
11198 switch (first_byte) {
11199 cases for each special:
11200 handle this special;
11203 switch (2nd byte) {
11204 cases for each unambiguous special:
11205 handle this special;
11207 cases for each ambigous special/literal:
11209 if (special) handle here
11211 default: // unambiguously literal:
11214 default: // is a literal char
11217 create EXACTish node for literal;
11218 while (more input and node isn't full) {
11219 switch (input_byte) {
11220 cases for each special;
11221 make sure parse pointer is set so that the next call to
11222 regatom will see this special first
11223 goto loopdone; // EXACTish node terminated by prev. char
11225 append char to EXACTISH node;
11227 get next input byte;
11231 return the generated node;
11233 Specifically there are two separate switches for handling
11234 escape sequences, with the one for handling literal escapes requiring
11235 a dummy entry for all of the special escapes that are actually handled
11238 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11240 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11242 Otherwise does not return NULL.
11246 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11248 regnode *ret = NULL;
11250 char *parse_start = RExC_parse;
11255 GET_RE_DEBUG_FLAGS_DECL;
11257 *flagp = WORST; /* Tentatively. */
11259 DEBUG_PARSE("atom");
11261 PERL_ARGS_ASSERT_REGATOM;
11264 switch ((U8)*RExC_parse) {
11266 RExC_seen_zerolen++;
11267 nextchar(pRExC_state);
11268 if (RExC_flags & RXf_PMf_MULTILINE)
11269 ret = reg_node(pRExC_state, MBOL);
11270 else if (RExC_flags & RXf_PMf_SINGLELINE)
11271 ret = reg_node(pRExC_state, SBOL);
11273 ret = reg_node(pRExC_state, BOL);
11274 Set_Node_Length(ret, 1); /* MJD */
11277 nextchar(pRExC_state);
11279 RExC_seen_zerolen++;
11280 if (RExC_flags & RXf_PMf_MULTILINE)
11281 ret = reg_node(pRExC_state, MEOL);
11282 else if (RExC_flags & RXf_PMf_SINGLELINE)
11283 ret = reg_node(pRExC_state, SEOL);
11285 ret = reg_node(pRExC_state, EOL);
11286 Set_Node_Length(ret, 1); /* MJD */
11289 nextchar(pRExC_state);
11290 if (RExC_flags & RXf_PMf_SINGLELINE)
11291 ret = reg_node(pRExC_state, SANY);
11293 ret = reg_node(pRExC_state, REG_ANY);
11294 *flagp |= HASWIDTH|SIMPLE;
11296 Set_Node_Length(ret, 1); /* MJD */
11300 char * const oregcomp_parse = ++RExC_parse;
11301 ret = regclass(pRExC_state, flagp,depth+1,
11302 FALSE, /* means parse the whole char class */
11303 TRUE, /* allow multi-char folds */
11304 FALSE, /* don't silence non-portable warnings. */
11306 if (*RExC_parse != ']') {
11307 RExC_parse = oregcomp_parse;
11308 vFAIL("Unmatched [");
11311 if (*flagp & RESTART_UTF8)
11313 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11316 nextchar(pRExC_state);
11317 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11321 nextchar(pRExC_state);
11322 ret = reg(pRExC_state, 2, &flags,depth+1);
11324 if (flags & TRYAGAIN) {
11325 if (RExC_parse == RExC_end) {
11326 /* Make parent create an empty node if needed. */
11327 *flagp |= TRYAGAIN;
11332 if (flags & RESTART_UTF8) {
11333 *flagp = RESTART_UTF8;
11336 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11339 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11343 if (flags & TRYAGAIN) {
11344 *flagp |= TRYAGAIN;
11347 vFAIL("Internal urp");
11348 /* Supposed to be caught earlier. */
11354 vFAIL("Quantifier follows nothing");
11359 This switch handles escape sequences that resolve to some kind
11360 of special regop and not to literal text. Escape sequnces that
11361 resolve to literal text are handled below in the switch marked
11364 Every entry in this switch *must* have a corresponding entry
11365 in the literal escape switch. However, the opposite is not
11366 required, as the default for this switch is to jump to the
11367 literal text handling code.
11369 switch ((U8)*++RExC_parse) {
11370 /* Special Escapes */
11372 RExC_seen_zerolen++;
11373 ret = reg_node(pRExC_state, SBOL);
11375 goto finish_meta_pat;
11377 ret = reg_node(pRExC_state, GPOS);
11378 RExC_seen |= REG_GPOS_SEEN;
11380 goto finish_meta_pat;
11382 RExC_seen_zerolen++;
11383 ret = reg_node(pRExC_state, KEEPS);
11385 /* XXX:dmq : disabling in-place substitution seems to
11386 * be necessary here to avoid cases of memory corruption, as
11387 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11389 RExC_seen |= REG_LOOKBEHIND_SEEN;
11390 goto finish_meta_pat;
11392 ret = reg_node(pRExC_state, SEOL);
11394 RExC_seen_zerolen++; /* Do not optimize RE away */
11395 goto finish_meta_pat;
11397 ret = reg_node(pRExC_state, EOS);
11399 RExC_seen_zerolen++; /* Do not optimize RE away */
11400 goto finish_meta_pat;
11402 ret = reg_node(pRExC_state, CANY);
11403 RExC_seen |= REG_CANY_SEEN;
11404 *flagp |= HASWIDTH|SIMPLE;
11406 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11408 goto finish_meta_pat;
11410 ret = reg_node(pRExC_state, CLUMP);
11411 *flagp |= HASWIDTH;
11412 goto finish_meta_pat;
11418 arg = ANYOF_WORDCHAR;
11422 RExC_seen_zerolen++;
11423 RExC_seen |= REG_LOOKBEHIND_SEEN;
11424 op = BOUND + get_regex_charset(RExC_flags);
11425 if (op > BOUNDA) { /* /aa is same as /a */
11428 else if (op == BOUNDL) {
11429 RExC_contains_locale = 1;
11431 ret = reg_node(pRExC_state, op);
11432 FLAGS(ret) = get_regex_charset(RExC_flags);
11434 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11435 /* diag_listed_as: Use "%s" instead of "%s" */
11436 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11438 goto finish_meta_pat;
11440 RExC_seen_zerolen++;
11441 RExC_seen |= REG_LOOKBEHIND_SEEN;
11442 op = NBOUND + get_regex_charset(RExC_flags);
11443 if (op > NBOUNDA) { /* /aa is same as /a */
11446 else if (op == NBOUNDL) {
11447 RExC_contains_locale = 1;
11449 ret = reg_node(pRExC_state, op);
11450 FLAGS(ret) = get_regex_charset(RExC_flags);
11452 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11453 /* diag_listed_as: Use "%s" instead of "%s" */
11454 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11456 goto finish_meta_pat;
11466 ret = reg_node(pRExC_state, LNBREAK);
11467 *flagp |= HASWIDTH|SIMPLE;
11468 goto finish_meta_pat;
11476 goto join_posix_op_known;
11482 arg = ANYOF_VERTWS;
11484 goto join_posix_op_known;
11494 op = POSIXD + get_regex_charset(RExC_flags);
11495 if (op > POSIXA) { /* /aa is same as /a */
11498 else if (op == POSIXL) {
11499 RExC_contains_locale = 1;
11502 join_posix_op_known:
11505 op += NPOSIXD - POSIXD;
11508 ret = reg_node(pRExC_state, op);
11510 FLAGS(ret) = namedclass_to_classnum(arg);
11513 *flagp |= HASWIDTH|SIMPLE;
11517 nextchar(pRExC_state);
11518 Set_Node_Length(ret, 2); /* MJD */
11524 char* parse_start = RExC_parse - 2;
11529 ret = regclass(pRExC_state, flagp,depth+1,
11530 TRUE, /* means just parse this element */
11531 FALSE, /* don't allow multi-char folds */
11532 FALSE, /* don't silence non-portable warnings.
11533 It would be a bug if these returned
11536 /* regclass() can only return RESTART_UTF8 if multi-char folds
11539 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11544 Set_Node_Offset(ret, parse_start + 2);
11545 Set_Node_Cur_Length(ret, parse_start);
11546 nextchar(pRExC_state);
11550 /* Handle \N and \N{NAME} with multiple code points here and not
11551 * below because it can be multicharacter. join_exact() will join
11552 * them up later on. Also this makes sure that things like
11553 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11554 * The options to the grok function call causes it to fail if the
11555 * sequence is just a single code point. We then go treat it as
11556 * just another character in the current EXACT node, and hence it
11557 * gets uniform treatment with all the other characters. The
11558 * special treatment for quantifiers is not needed for such single
11559 * character sequences */
11561 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11562 FALSE /* not strict */ )) {
11563 if (*flagp & RESTART_UTF8)
11569 case 'k': /* Handle \k<NAME> and \k'NAME' */
11572 char ch= RExC_parse[1];
11573 if (ch != '<' && ch != '\'' && ch != '{') {
11575 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11576 vFAIL2("Sequence %.2s... not terminated",parse_start);
11578 /* this pretty much dupes the code for (?P=...) in reg(), if
11579 you change this make sure you change that */
11580 char* name_start = (RExC_parse += 2);
11582 SV *sv_dat = reg_scan_name(pRExC_state,
11583 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11584 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11585 if (RExC_parse == name_start || *RExC_parse != ch)
11586 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11587 vFAIL2("Sequence %.3s... not terminated",parse_start);
11590 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11591 RExC_rxi->data->data[num]=(void*)sv_dat;
11592 SvREFCNT_inc_simple_void(sv_dat);
11596 ret = reganode(pRExC_state,
11599 : (ASCII_FOLD_RESTRICTED)
11601 : (AT_LEAST_UNI_SEMANTICS)
11607 *flagp |= HASWIDTH;
11609 /* override incorrect value set in reganode MJD */
11610 Set_Node_Offset(ret, parse_start+1);
11611 Set_Node_Cur_Length(ret, parse_start);
11612 nextchar(pRExC_state);
11618 case '1': case '2': case '3': case '4':
11619 case '5': case '6': case '7': case '8': case '9':
11624 if (*RExC_parse == 'g') {
11628 if (*RExC_parse == '{') {
11632 if (*RExC_parse == '-') {
11636 if (hasbrace && !isDIGIT(*RExC_parse)) {
11637 if (isrel) RExC_parse--;
11639 goto parse_named_seq;
11642 num = S_backref_value(RExC_parse);
11644 vFAIL("Reference to invalid group 0");
11645 else if (num == I32_MAX) {
11646 if (isDIGIT(*RExC_parse))
11647 vFAIL("Reference to nonexistent group");
11649 vFAIL("Unterminated \\g... pattern");
11653 num = RExC_npar - num;
11655 vFAIL("Reference to nonexistent or unclosed group");
11659 num = S_backref_value(RExC_parse);
11660 /* bare \NNN might be backref or octal - if it is larger than or equal
11661 * RExC_npar then it is assumed to be and octal escape.
11662 * Note RExC_npar is +1 from the actual number of parens*/
11663 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11664 && *RExC_parse != '8' && *RExC_parse != '9'))
11666 /* Probably a character specified in octal, e.g. \35 */
11671 /* at this point RExC_parse definitely points to a backref
11674 #ifdef RE_TRACK_PATTERN_OFFSETS
11675 char * const parse_start = RExC_parse - 1; /* MJD */
11677 while (isDIGIT(*RExC_parse))
11680 if (*RExC_parse != '}')
11681 vFAIL("Unterminated \\g{...} pattern");
11685 if (num > (I32)RExC_rx->nparens)
11686 vFAIL("Reference to nonexistent group");
11689 ret = reganode(pRExC_state,
11692 : (ASCII_FOLD_RESTRICTED)
11694 : (AT_LEAST_UNI_SEMANTICS)
11700 *flagp |= HASWIDTH;
11702 /* override incorrect value set in reganode MJD */
11703 Set_Node_Offset(ret, parse_start+1);
11704 Set_Node_Cur_Length(ret, parse_start);
11706 nextchar(pRExC_state);
11711 if (RExC_parse >= RExC_end)
11712 FAIL("Trailing \\");
11715 /* Do not generate "unrecognized" warnings here, we fall
11716 back into the quick-grab loop below */
11723 if (RExC_flags & RXf_PMf_EXTENDED) {
11724 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
11725 if (RExC_parse < RExC_end)
11732 parse_start = RExC_parse - 1;
11741 #define MAX_NODE_STRING_SIZE 127
11742 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11744 U8 upper_parse = MAX_NODE_STRING_SIZE;
11745 U8 node_type = compute_EXACTish(pRExC_state);
11746 bool next_is_quantifier;
11747 char * oldp = NULL;
11749 /* We can convert EXACTF nodes to EXACTFU if they contain only
11750 * characters that match identically regardless of the target
11751 * string's UTF8ness. The reason to do this is that EXACTF is not
11752 * trie-able, EXACTFU is.
11754 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11755 * contain only above-Latin1 characters (hence must be in UTF8),
11756 * which don't participate in folds with Latin1-range characters,
11757 * as the latter's folds aren't known until runtime. (We don't
11758 * need to figure this out until pass 2) */
11759 bool maybe_exactfu = PASS2
11760 && (node_type == EXACTF || node_type == EXACTFL);
11762 /* If a folding node contains only code points that don't
11763 * participate in folds, it can be changed into an EXACT node,
11764 * which allows the optimizer more things to look for */
11767 ret = reg_node(pRExC_state, node_type);
11769 /* In pass1, folded, we use a temporary buffer instead of the
11770 * actual node, as the node doesn't exist yet */
11771 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11777 /* We do the EXACTFish to EXACT node only if folding. (And we
11778 * don't need to figure this out until pass 2) */
11779 maybe_exact = FOLD && PASS2;
11781 /* XXX The node can hold up to 255 bytes, yet this only goes to
11782 * 127. I (khw) do not know why. Keeping it somewhat less than
11783 * 255 allows us to not have to worry about overflow due to
11784 * converting to utf8 and fold expansion, but that value is
11785 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
11786 * split up by this limit into a single one using the real max of
11787 * 255. Even at 127, this breaks under rare circumstances. If
11788 * folding, we do not want to split a node at a character that is a
11789 * non-final in a multi-char fold, as an input string could just
11790 * happen to want to match across the node boundary. The join
11791 * would solve that problem if the join actually happens. But a
11792 * series of more than two nodes in a row each of 127 would cause
11793 * the first join to succeed to get to 254, but then there wouldn't
11794 * be room for the next one, which could at be one of those split
11795 * multi-char folds. I don't know of any fool-proof solution. One
11796 * could back off to end with only a code point that isn't such a
11797 * non-final, but it is possible for there not to be any in the
11799 for (p = RExC_parse - 1;
11800 len < upper_parse && p < RExC_end;
11805 if (RExC_flags & RXf_PMf_EXTENDED)
11806 p = regpatws(pRExC_state, p,
11807 TRUE); /* means recognize comments */
11818 /* Literal Escapes Switch
11820 This switch is meant to handle escape sequences that
11821 resolve to a literal character.
11823 Every escape sequence that represents something
11824 else, like an assertion or a char class, is handled
11825 in the switch marked 'Special Escapes' above in this
11826 routine, but also has an entry here as anything that
11827 isn't explicitly mentioned here will be treated as
11828 an unescaped equivalent literal.
11831 switch ((U8)*++p) {
11832 /* These are all the special escapes. */
11833 case 'A': /* Start assertion */
11834 case 'b': case 'B': /* Word-boundary assertion*/
11835 case 'C': /* Single char !DANGEROUS! */
11836 case 'd': case 'D': /* digit class */
11837 case 'g': case 'G': /* generic-backref, pos assertion */
11838 case 'h': case 'H': /* HORIZWS */
11839 case 'k': case 'K': /* named backref, keep marker */
11840 case 'p': case 'P': /* Unicode property */
11841 case 'R': /* LNBREAK */
11842 case 's': case 'S': /* space class */
11843 case 'v': case 'V': /* VERTWS */
11844 case 'w': case 'W': /* word class */
11845 case 'X': /* eXtended Unicode "combining
11846 character sequence" */
11847 case 'z': case 'Z': /* End of line/string assertion */
11851 /* Anything after here is an escape that resolves to a
11852 literal. (Except digits, which may or may not)
11858 case 'N': /* Handle a single-code point named character. */
11859 /* The options cause it to fail if a multiple code
11860 * point sequence. Handle those in the switch() above
11862 RExC_parse = p + 1;
11863 if (! grok_bslash_N(pRExC_state, NULL, &ender,
11864 flagp, depth, FALSE,
11865 FALSE /* not strict */ ))
11867 if (*flagp & RESTART_UTF8)
11868 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11869 RExC_parse = p = oldp;
11873 if (ender > 0xff) {
11890 ender = ASCII_TO_NATIVE('\033');
11900 const char* error_msg;
11902 bool valid = grok_bslash_o(&p,
11905 TRUE, /* out warnings */
11906 FALSE, /* not strict */
11907 TRUE, /* Output warnings
11912 RExC_parse = p; /* going to die anyway; point
11913 to exact spot of failure */
11917 if (PL_encoding && ender < 0x100) {
11918 goto recode_encoding;
11920 if (ender > 0xff) {
11927 UV result = UV_MAX; /* initialize to erroneous
11929 const char* error_msg;
11931 bool valid = grok_bslash_x(&p,
11934 TRUE, /* out warnings */
11935 FALSE, /* not strict */
11936 TRUE, /* Output warnings
11941 RExC_parse = p; /* going to die anyway; point
11942 to exact spot of failure */
11947 if (PL_encoding && ender < 0x100) {
11948 goto recode_encoding;
11950 if (ender > 0xff) {
11957 ender = grok_bslash_c(*p++, SIZE_ONLY);
11959 case '8': case '9': /* must be a backreference */
11962 case '1': case '2': case '3':case '4':
11963 case '5': case '6': case '7':
11964 /* When we parse backslash escapes there is ambiguity
11965 * between backreferences and octal escapes. Any escape
11966 * from \1 - \9 is a backreference, any multi-digit
11967 * escape which does not start with 0 and which when
11968 * evaluated as decimal could refer to an already
11969 * parsed capture buffer is a backslash. Anything else
11972 * Note this implies that \118 could be interpreted as
11973 * 118 OR as "\11" . "8" depending on whether there
11974 * were 118 capture buffers defined already in the
11977 /* NOTE, RExC_npar is 1 more than the actual number of
11978 * parens we have seen so far, hence the < RExC_npar below. */
11980 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11981 { /* Not to be treated as an octal constant, go
11989 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11991 ender = grok_oct(p, &numlen, &flags, NULL);
11992 if (ender > 0xff) {
11996 if (SIZE_ONLY /* like \08, \178 */
11999 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12001 reg_warn_non_literal_string(
12003 form_short_octal_warning(p, numlen));
12006 if (PL_encoding && ender < 0x100)
12007 goto recode_encoding;
12010 if (! RExC_override_recoding) {
12011 SV* enc = PL_encoding;
12012 ender = reg_recode((const char)(U8)ender, &enc);
12013 if (!enc && SIZE_ONLY)
12014 ckWARNreg(p, "Invalid escape in the specified encoding");
12020 FAIL("Trailing \\");
12023 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12024 /* Include any { following the alpha to emphasize
12025 * that it could be part of an escape at some point
12027 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12028 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12030 goto normal_default;
12031 } /* End of switch on '\' */
12034 /* Currently we don't warn when the lbrace is at the start
12035 * of a construct. This catches it in the middle of a
12036 * literal string, or when its the first thing after
12037 * something like "\b" */
12039 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12041 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12044 default: /* A literal character */
12046 if (UTF8_IS_START(*p) && UTF) {
12048 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12049 &numlen, UTF8_ALLOW_DEFAULT);
12055 } /* End of switch on the literal */
12057 /* Here, have looked at the literal character and <ender>
12058 * contains its ordinal, <p> points to the character after it
12061 if ( RExC_flags & RXf_PMf_EXTENDED)
12062 p = regpatws(pRExC_state, p,
12063 TRUE); /* means recognize comments */
12065 /* If the next thing is a quantifier, it applies to this
12066 * character only, which means that this character has to be in
12067 * its own node and can't just be appended to the string in an
12068 * existing node, so if there are already other characters in
12069 * the node, close the node with just them, and set up to do
12070 * this character again next time through, when it will be the
12071 * only thing in its new node */
12072 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12078 if (! FOLD /* The simple case, just append the literal */
12079 || (LOC /* Also don't fold for tricky chars under /l */
12080 && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12083 const STRLEN unilen = reguni(pRExC_state, ender, s);
12089 /* The loop increments <len> each time, as all but this
12090 * path (and one other) through it add a single byte to
12091 * the EXACTish node. But this one has changed len to
12092 * be the correct final value, so subtract one to
12093 * cancel out the increment that follows */
12097 REGC((char)ender, s++);
12100 /* Can get here if folding only if is one of the /l
12101 * characters whose fold depends on the locale. The
12102 * occurrence of any of these indicate that we can't
12103 * simplify things */
12105 maybe_exact = FALSE;
12106 maybe_exactfu = FALSE;
12111 /* See comments for join_exact() as to why we fold this
12112 * non-UTF at compile time */
12113 || (node_type == EXACTFU
12114 && ender == LATIN_SMALL_LETTER_SHARP_S)))
12116 /* Here, are folding and are not UTF-8 encoded; therefore
12117 * the character must be in the range 0-255, and is not /l
12118 * (Not /l because we already handled these under /l in
12119 * is_PROBLEMATIC_LOCALE_FOLD_cp */
12120 if (IS_IN_SOME_FOLD_L1(ender)) {
12121 maybe_exact = FALSE;
12123 /* See if the character's fold differs between /d and
12124 * /u. This includes the multi-char fold SHARP S to
12127 && (PL_fold[ender] != PL_fold_latin1[ender]
12128 || ender == LATIN_SMALL_LETTER_SHARP_S
12130 && isARG2_lower_or_UPPER_ARG1('s', ender)
12131 && isARG2_lower_or_UPPER_ARG1('s',
12134 maybe_exactfu = FALSE;
12138 /* Even when folding, we store just the input character, as
12139 * we have an array that finds its fold quickly */
12140 *(s++) = (char) ender;
12142 else { /* FOLD and UTF */
12143 /* Unlike the non-fold case, we do actually have to
12144 * calculate the results here in pass 1. This is for two
12145 * reasons, the folded length may be longer than the
12146 * unfolded, and we have to calculate how many EXACTish
12147 * nodes it will take; and we may run out of room in a node
12148 * in the middle of a potential multi-char fold, and have
12149 * to back off accordingly. (Hence we can't use REGC for
12150 * the simple case just below.) */
12153 if (isASCII(ender)) {
12154 folded = toFOLD(ender);
12155 *(s)++ = (U8) folded;
12160 folded = _to_uni_fold_flags(
12164 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12165 ? FOLD_FLAGS_NOMIX_ASCII
12169 /* The loop increments <len> each time, as all but this
12170 * path (and one other) through it add a single byte to
12171 * the EXACTish node. But this one has changed len to
12172 * be the correct final value, so subtract one to
12173 * cancel out the increment that follows */
12174 len += foldlen - 1;
12176 /* If this node only contains non-folding code points so
12177 * far, see if this new one is also non-folding */
12179 if (folded != ender) {
12180 maybe_exact = FALSE;
12183 /* Here the fold is the original; we have to check
12184 * further to see if anything folds to it */
12185 if (_invlist_contains_cp(PL_utf8_foldable,
12188 maybe_exact = FALSE;
12195 if (next_is_quantifier) {
12197 /* Here, the next input is a quantifier, and to get here,
12198 * the current character is the only one in the node.
12199 * Also, here <len> doesn't include the final byte for this
12205 } /* End of loop through literal characters */
12207 /* Here we have either exhausted the input or ran out of room in
12208 * the node. (If we encountered a character that can't be in the
12209 * node, transfer is made directly to <loopdone>, and so we
12210 * wouldn't have fallen off the end of the loop.) In the latter
12211 * case, we artificially have to split the node into two, because
12212 * we just don't have enough space to hold everything. This
12213 * creates a problem if the final character participates in a
12214 * multi-character fold in the non-final position, as a match that
12215 * should have occurred won't, due to the way nodes are matched,
12216 * and our artificial boundary. So back off until we find a non-
12217 * problematic character -- one that isn't at the beginning or
12218 * middle of such a fold. (Either it doesn't participate in any
12219 * folds, or appears only in the final position of all the folds it
12220 * does participate in.) A better solution with far fewer false
12221 * positives, and that would fill the nodes more completely, would
12222 * be to actually have available all the multi-character folds to
12223 * test against, and to back-off only far enough to be sure that
12224 * this node isn't ending with a partial one. <upper_parse> is set
12225 * further below (if we need to reparse the node) to include just
12226 * up through that final non-problematic character that this code
12227 * identifies, so when it is set to less than the full node, we can
12228 * skip the rest of this */
12229 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12231 const STRLEN full_len = len;
12233 assert(len >= MAX_NODE_STRING_SIZE);
12235 /* Here, <s> points to the final byte of the final character.
12236 * Look backwards through the string until find a non-
12237 * problematic character */
12241 /* This has no multi-char folds to non-UTF characters */
12242 if (ASCII_FOLD_RESTRICTED) {
12246 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12250 if (! PL_NonL1NonFinalFold) {
12251 PL_NonL1NonFinalFold = _new_invlist_C_array(
12252 NonL1_Perl_Non_Final_Folds_invlist);
12255 /* Point to the first byte of the final character */
12256 s = (char *) utf8_hop((U8 *) s, -1);
12258 while (s >= s0) { /* Search backwards until find
12259 non-problematic char */
12260 if (UTF8_IS_INVARIANT(*s)) {
12262 /* There are no ascii characters that participate
12263 * in multi-char folds under /aa. In EBCDIC, the
12264 * non-ascii invariants are all control characters,
12265 * so don't ever participate in any folds. */
12266 if (ASCII_FOLD_RESTRICTED
12267 || ! IS_NON_FINAL_FOLD(*s))
12272 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12273 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12279 else if (! _invlist_contains_cp(
12280 PL_NonL1NonFinalFold,
12281 valid_utf8_to_uvchr((U8 *) s, NULL)))
12286 /* Here, the current character is problematic in that
12287 * it does occur in the non-final position of some
12288 * fold, so try the character before it, but have to
12289 * special case the very first byte in the string, so
12290 * we don't read outside the string */
12291 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12292 } /* End of loop backwards through the string */
12294 /* If there were only problematic characters in the string,
12295 * <s> will point to before s0, in which case the length
12296 * should be 0, otherwise include the length of the
12297 * non-problematic character just found */
12298 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12301 /* Here, have found the final character, if any, that is
12302 * non-problematic as far as ending the node without splitting
12303 * it across a potential multi-char fold. <len> contains the
12304 * number of bytes in the node up-to and including that
12305 * character, or is 0 if there is no such character, meaning
12306 * the whole node contains only problematic characters. In
12307 * this case, give up and just take the node as-is. We can't
12312 /* If the node ends in an 's' we make sure it stays EXACTF,
12313 * as if it turns into an EXACTFU, it could later get
12314 * joined with another 's' that would then wrongly match
12316 if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12318 maybe_exactfu = FALSE;
12322 /* Here, the node does contain some characters that aren't
12323 * problematic. If one such is the final character in the
12324 * node, we are done */
12325 if (len == full_len) {
12328 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12330 /* If the final character is problematic, but the
12331 * penultimate is not, back-off that last character to
12332 * later start a new node with it */
12337 /* Here, the final non-problematic character is earlier
12338 * in the input than the penultimate character. What we do
12339 * is reparse from the beginning, going up only as far as
12340 * this final ok one, thus guaranteeing that the node ends
12341 * in an acceptable character. The reason we reparse is
12342 * that we know how far in the character is, but we don't
12343 * know how to correlate its position with the input parse.
12344 * An alternate implementation would be to build that
12345 * correlation as we go along during the original parse,
12346 * but that would entail extra work for every node, whereas
12347 * this code gets executed only when the string is too
12348 * large for the node, and the final two characters are
12349 * problematic, an infrequent occurrence. Yet another
12350 * possible strategy would be to save the tail of the
12351 * string, and the next time regatom is called, initialize
12352 * with that. The problem with this is that unless you
12353 * back off one more character, you won't be guaranteed
12354 * regatom will get called again, unless regbranch,
12355 * regpiece ... are also changed. If you do back off that
12356 * extra character, so that there is input guaranteed to
12357 * force calling regatom, you can't handle the case where
12358 * just the first character in the node is acceptable. I
12359 * (khw) decided to try this method which doesn't have that
12360 * pitfall; if performance issues are found, we can do a
12361 * combination of the current approach plus that one */
12367 } /* End of verifying node ends with an appropriate char */
12369 loopdone: /* Jumped to when encounters something that shouldn't be in
12372 /* I (khw) don't know if you can get here with zero length, but the
12373 * old code handled this situation by creating a zero-length EXACT
12374 * node. Might as well be NOTHING instead */
12380 /* If 'maybe_exact' is still set here, means there are no
12381 * code points in the node that participate in folds;
12382 * similarly for 'maybe_exactfu' and code points that match
12383 * differently depending on UTF8ness of the target string
12384 * (for /u), or depending on locale for /l */
12388 else if (maybe_exactfu) {
12392 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12393 FALSE /* Don't look to see if could
12394 be turned into an EXACT
12395 node, as we have already
12400 RExC_parse = p - 1;
12401 Set_Node_Cur_Length(ret, parse_start);
12402 nextchar(pRExC_state);
12404 /* len is STRLEN which is unsigned, need to copy to signed */
12407 vFAIL("Internal disaster");
12410 } /* End of label 'defchar:' */
12412 } /* End of giant switch on input character */
12418 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12420 /* Returns the next non-pattern-white space, non-comment character (the
12421 * latter only if 'recognize_comment is true) in the string p, which is
12422 * ended by RExC_end. See also reg_skipcomment */
12423 const char *e = RExC_end;
12425 PERL_ARGS_ASSERT_REGPATWS;
12429 if ((len = is_PATWS_safe(p, e, UTF))) {
12432 else if (recognize_comment && *p == '#') {
12433 p = reg_skipcomment(pRExC_state, p);
12442 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12444 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12445 * sets up the bitmap and any flags, removing those code points from the
12446 * inversion list, setting it to NULL should it become completely empty */
12448 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12449 assert(PL_regkind[OP(node)] == ANYOF);
12451 ANYOF_BITMAP_ZERO(node);
12452 if (*invlist_ptr) {
12454 /* This gets set if we actually need to modify things */
12455 bool change_invlist = FALSE;
12459 /* Start looking through *invlist_ptr */
12460 invlist_iterinit(*invlist_ptr);
12461 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12465 if (end == UV_MAX && start <= 256) {
12466 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12468 else if (end >= 256) {
12469 ANYOF_FLAGS(node) |= ANYOF_UTF8;
12472 /* Quit if are above what we should change */
12477 change_invlist = TRUE;
12479 /* Set all the bits in the range, up to the max that we are doing */
12480 high = (end < 255) ? end : 255;
12481 for (i = start; i <= (int) high; i++) {
12482 if (! ANYOF_BITMAP_TEST(node, i)) {
12483 ANYOF_BITMAP_SET(node, i);
12487 invlist_iterfinish(*invlist_ptr);
12489 /* Done with loop; remove any code points that are in the bitmap from
12490 * *invlist_ptr; similarly for code points above latin1 if we have a
12491 * flag to match all of them anyways */
12492 if (change_invlist) {
12493 _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12495 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12496 _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12499 /* If have completely emptied it, remove it completely */
12500 if (_invlist_len(*invlist_ptr) == 0) {
12501 SvREFCNT_dec_NN(*invlist_ptr);
12502 *invlist_ptr = NULL;
12507 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12508 Character classes ([:foo:]) can also be negated ([:^foo:]).
12509 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12510 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12511 but trigger failures because they are currently unimplemented. */
12513 #define POSIXCC_DONE(c) ((c) == ':')
12514 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12515 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12517 PERL_STATIC_INLINE I32
12518 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12520 I32 namedclass = OOB_NAMEDCLASS;
12522 PERL_ARGS_ASSERT_REGPPOSIXCC;
12524 if (value == '[' && RExC_parse + 1 < RExC_end &&
12525 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12526 POSIXCC(UCHARAT(RExC_parse)))
12528 const char c = UCHARAT(RExC_parse);
12529 char* const s = RExC_parse++;
12531 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12533 if (RExC_parse == RExC_end) {
12536 /* Try to give a better location for the error (than the end of
12537 * the string) by looking for the matching ']' */
12539 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12542 vFAIL2("Unmatched '%c' in POSIX class", c);
12544 /* Grandfather lone [:, [=, [. */
12548 const char* const t = RExC_parse++; /* skip over the c */
12551 if (UCHARAT(RExC_parse) == ']') {
12552 const char *posixcc = s + 1;
12553 RExC_parse++; /* skip over the ending ] */
12556 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12557 const I32 skip = t - posixcc;
12559 /* Initially switch on the length of the name. */
12562 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12563 this is the Perl \w
12565 namedclass = ANYOF_WORDCHAR;
12568 /* Names all of length 5. */
12569 /* alnum alpha ascii blank cntrl digit graph lower
12570 print punct space upper */
12571 /* Offset 4 gives the best switch position. */
12572 switch (posixcc[4]) {
12574 if (memEQ(posixcc, "alph", 4)) /* alpha */
12575 namedclass = ANYOF_ALPHA;
12578 if (memEQ(posixcc, "spac", 4)) /* space */
12579 namedclass = ANYOF_PSXSPC;
12582 if (memEQ(posixcc, "grap", 4)) /* graph */
12583 namedclass = ANYOF_GRAPH;
12586 if (memEQ(posixcc, "asci", 4)) /* ascii */
12587 namedclass = ANYOF_ASCII;
12590 if (memEQ(posixcc, "blan", 4)) /* blank */
12591 namedclass = ANYOF_BLANK;
12594 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12595 namedclass = ANYOF_CNTRL;
12598 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12599 namedclass = ANYOF_ALPHANUMERIC;
12602 if (memEQ(posixcc, "lowe", 4)) /* lower */
12603 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12604 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12605 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12608 if (memEQ(posixcc, "digi", 4)) /* digit */
12609 namedclass = ANYOF_DIGIT;
12610 else if (memEQ(posixcc, "prin", 4)) /* print */
12611 namedclass = ANYOF_PRINT;
12612 else if (memEQ(posixcc, "punc", 4)) /* punct */
12613 namedclass = ANYOF_PUNCT;
12618 if (memEQ(posixcc, "xdigit", 6))
12619 namedclass = ANYOF_XDIGIT;
12623 if (namedclass == OOB_NAMEDCLASS)
12625 "POSIX class [:%"UTF8f":] unknown",
12626 UTF8fARG(UTF, t - s - 1, s + 1));
12628 /* The #defines are structured so each complement is +1 to
12629 * the normal one */
12633 assert (posixcc[skip] == ':');
12634 assert (posixcc[skip+1] == ']');
12635 } else if (!SIZE_ONLY) {
12636 /* [[=foo=]] and [[.foo.]] are still future. */
12638 /* adjust RExC_parse so the warning shows after
12639 the class closes */
12640 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12642 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12645 /* Maternal grandfather:
12646 * "[:" ending in ":" but not in ":]" */
12648 vFAIL("Unmatched '[' in POSIX class");
12651 /* Grandfather lone [:, [=, [. */
12661 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12663 /* This applies some heuristics at the current parse position (which should
12664 * be at a '[') to see if what follows might be intended to be a [:posix:]
12665 * class. It returns true if it really is a posix class, of course, but it
12666 * also can return true if it thinks that what was intended was a posix
12667 * class that didn't quite make it.
12669 * It will return true for
12671 * [:alphanumerics] (as long as the ] isn't followed immediately by a
12672 * ')' indicating the end of the (?[
12673 * [:any garbage including %^&$ punctuation:]
12675 * This is designed to be called only from S_handle_regex_sets; it could be
12676 * easily adapted to be called from the spot at the beginning of regclass()
12677 * that checks to see in a normal bracketed class if the surrounding []
12678 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
12679 * change long-standing behavior, so I (khw) didn't do that */
12680 char* p = RExC_parse + 1;
12681 char first_char = *p;
12683 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12685 assert(*(p - 1) == '[');
12687 if (! POSIXCC(first_char)) {
12692 while (p < RExC_end && isWORDCHAR(*p)) p++;
12694 if (p >= RExC_end) {
12698 if (p - RExC_parse > 2 /* Got at least 1 word character */
12699 && (*p == first_char
12700 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12705 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12708 && p - RExC_parse > 2 /* [:] evaluates to colon;
12709 [::] is a bad posix class. */
12710 && first_char == *(p - 1));
12714 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12715 I32 *flagp, U32 depth,
12716 char * const oregcomp_parse)
12718 /* Handle the (?[...]) construct to do set operations */
12721 UV start, end; /* End points of code point ranges */
12723 char *save_end, *save_parse;
12728 const bool save_fold = FOLD;
12730 GET_RE_DEBUG_FLAGS_DECL;
12732 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12735 vFAIL("(?[...]) not valid in locale");
12737 RExC_uni_semantics = 1;
12739 /* This will return only an ANYOF regnode, or (unlikely) something smaller
12740 * (such as EXACT). Thus we can skip most everything if just sizing. We
12741 * call regclass to handle '[]' so as to not have to reinvent its parsing
12742 * rules here (throwing away the size it computes each time). And, we exit
12743 * upon an unescaped ']' that isn't one ending a regclass. To do both
12744 * these things, we need to realize that something preceded by a backslash
12745 * is escaped, so we have to keep track of backslashes */
12747 UV depth = 0; /* how many nested (?[...]) constructs */
12749 Perl_ck_warner_d(aTHX_
12750 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12751 "The regex_sets feature is experimental" REPORT_LOCATION,
12752 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12754 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12755 RExC_precomp + (RExC_parse - RExC_precomp)));
12757 while (RExC_parse < RExC_end) {
12758 SV* current = NULL;
12759 RExC_parse = regpatws(pRExC_state, RExC_parse,
12760 TRUE); /* means recognize comments */
12761 switch (*RExC_parse) {
12763 if (RExC_parse[1] == '[') depth++, RExC_parse++;
12768 /* Skip the next byte (which could cause us to end up in
12769 * the middle of a UTF-8 character, but since none of those
12770 * are confusable with anything we currently handle in this
12771 * switch (invariants all), it's safe. We'll just hit the
12772 * default: case next time and keep on incrementing until
12773 * we find one of the invariants we do handle. */
12778 /* If this looks like it is a [:posix:] class, leave the
12779 * parse pointer at the '[' to fool regclass() into
12780 * thinking it is part of a '[[:posix:]]'. That function
12781 * will use strict checking to force a syntax error if it
12782 * doesn't work out to a legitimate class */
12783 bool is_posix_class
12784 = could_it_be_a_POSIX_class(pRExC_state);
12785 if (! is_posix_class) {
12789 /* regclass() can only return RESTART_UTF8 if multi-char
12790 folds are allowed. */
12791 if (!regclass(pRExC_state, flagp,depth+1,
12792 is_posix_class, /* parse the whole char
12793 class only if not a
12795 FALSE, /* don't allow multi-char folds */
12796 TRUE, /* silence non-portable warnings. */
12798 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12801 /* function call leaves parse pointing to the ']', except
12802 * if we faked it */
12803 if (is_posix_class) {
12807 SvREFCNT_dec(current); /* In case it returned something */
12812 if (depth--) break;
12814 if (RExC_parse < RExC_end
12815 && *RExC_parse == ')')
12817 node = reganode(pRExC_state, ANYOF, 0);
12818 RExC_size += ANYOF_SKIP;
12819 nextchar(pRExC_state);
12820 Set_Node_Length(node,
12821 RExC_parse - oregcomp_parse + 1); /* MJD */
12830 FAIL("Syntax error in (?[...])");
12833 /* Pass 2 only after this. Everything in this construct is a
12834 * metacharacter. Operands begin with either a '\' (for an escape
12835 * sequence), or a '[' for a bracketed character class. Any other
12836 * character should be an operator, or parenthesis for grouping. Both
12837 * types of operands are handled by calling regclass() to parse them. It
12838 * is called with a parameter to indicate to return the computed inversion
12839 * list. The parsing here is implemented via a stack. Each entry on the
12840 * stack is a single character representing one of the operators, or the
12841 * '('; or else a pointer to an operand inversion list. */
12843 #define IS_OPERAND(a) (! SvIOK(a))
12845 /* The stack starts empty. It is a syntax error if the first thing parsed
12846 * is a binary operator; everything else is pushed on the stack. When an
12847 * operand is parsed, the top of the stack is examined. If it is a binary
12848 * operator, the item before it should be an operand, and both are replaced
12849 * by the result of doing that operation on the new operand and the one on
12850 * the stack. Thus a sequence of binary operands is reduced to a single
12851 * one before the next one is parsed.
12853 * A unary operator may immediately follow a binary in the input, for
12856 * When an operand is parsed and the top of the stack is a unary operator,
12857 * the operation is performed, and then the stack is rechecked to see if
12858 * this new operand is part of a binary operation; if so, it is handled as
12861 * A '(' is simply pushed on the stack; it is valid only if the stack is
12862 * empty, or the top element of the stack is an operator or another '('
12863 * (for which the parenthesized expression will become an operand). By the
12864 * time the corresponding ')' is parsed everything in between should have
12865 * been parsed and evaluated to a single operand (or else is a syntax
12866 * error), and is handled as a regular operand */
12868 sv_2mortal((SV *)(stack = newAV()));
12870 while (RExC_parse < RExC_end) {
12871 I32 top_index = av_tindex(stack);
12873 SV* current = NULL;
12875 /* Skip white space */
12876 RExC_parse = regpatws(pRExC_state, RExC_parse,
12877 TRUE /* means recognize comments */ );
12878 if (RExC_parse >= RExC_end) {
12879 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12881 if ((curchar = UCHARAT(RExC_parse)) == ']') {
12888 if (av_tindex(stack) >= 0 /* This makes sure that we can
12889 safely subtract 1 from
12890 RExC_parse in the next clause.
12891 If we have something on the
12892 stack, we have parsed something
12894 && UCHARAT(RExC_parse - 1) == '('
12895 && RExC_parse < RExC_end)
12897 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12898 * This happens when we have some thing like
12900 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12902 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
12904 * Here we would be handling the interpolated
12905 * '$thai_or_lao'. We handle this by a recursive call to
12906 * ourselves which returns the inversion list the
12907 * interpolated expression evaluates to. We use the flags
12908 * from the interpolated pattern. */
12909 U32 save_flags = RExC_flags;
12910 const char * const save_parse = ++RExC_parse;
12912 parse_lparen_question_flags(pRExC_state);
12914 if (RExC_parse == save_parse /* Makes sure there was at
12915 least one flag (or this
12916 embedding wasn't compiled)
12918 || RExC_parse >= RExC_end - 4
12919 || UCHARAT(RExC_parse) != ':'
12920 || UCHARAT(++RExC_parse) != '('
12921 || UCHARAT(++RExC_parse) != '?'
12922 || UCHARAT(++RExC_parse) != '[')
12925 /* In combination with the above, this moves the
12926 * pointer to the point just after the first erroneous
12927 * character (or if there are no flags, to where they
12928 * should have been) */
12929 if (RExC_parse >= RExC_end - 4) {
12930 RExC_parse = RExC_end;
12932 else if (RExC_parse != save_parse) {
12933 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12935 vFAIL("Expecting '(?flags:(?[...'");
12938 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
12939 depth+1, oregcomp_parse);
12941 /* Here, 'current' contains the embedded expression's
12942 * inversion list, and RExC_parse points to the trailing
12943 * ']'; the next character should be the ')' which will be
12944 * paired with the '(' that has been put on the stack, so
12945 * the whole embedded expression reduces to '(operand)' */
12948 RExC_flags = save_flags;
12949 goto handle_operand;
12954 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12955 vFAIL("Unexpected character");
12958 /* regclass() can only return RESTART_UTF8 if multi-char
12959 folds are allowed. */
12960 if (!regclass(pRExC_state, flagp,depth+1,
12961 TRUE, /* means parse just the next thing */
12962 FALSE, /* don't allow multi-char folds */
12963 FALSE, /* don't silence non-portable warnings. */
12965 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12967 /* regclass() will return with parsing just the \ sequence,
12968 * leaving the parse pointer at the next thing to parse */
12970 goto handle_operand;
12972 case '[': /* Is a bracketed character class */
12974 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12976 if (! is_posix_class) {
12980 /* regclass() can only return RESTART_UTF8 if multi-char
12981 folds are allowed. */
12982 if(!regclass(pRExC_state, flagp,depth+1,
12983 is_posix_class, /* parse the whole char class
12984 only if not a posix class */
12985 FALSE, /* don't allow multi-char folds */
12986 FALSE, /* don't silence non-portable warnings. */
12988 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12990 /* function call leaves parse pointing to the ']', except if we
12992 if (is_posix_class) {
12996 goto handle_operand;
13005 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13006 || ! IS_OPERAND(*top_ptr))
13009 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13011 av_push(stack, newSVuv(curchar));
13015 av_push(stack, newSVuv(curchar));
13019 if (top_index >= 0) {
13020 top_ptr = av_fetch(stack, top_index, FALSE);
13022 if (IS_OPERAND(*top_ptr)) {
13024 vFAIL("Unexpected '(' with no preceding operator");
13027 av_push(stack, newSVuv(curchar));
13034 || ! (current = av_pop(stack))
13035 || ! IS_OPERAND(current)
13036 || ! (lparen = av_pop(stack))
13037 || IS_OPERAND(lparen)
13038 || SvUV(lparen) != '(')
13040 SvREFCNT_dec(current);
13042 vFAIL("Unexpected ')'");
13045 SvREFCNT_dec_NN(lparen);
13052 /* Here, we have an operand to process, in 'current' */
13054 if (top_index < 0) { /* Just push if stack is empty */
13055 av_push(stack, current);
13058 SV* top = av_pop(stack);
13060 char current_operator;
13062 if (IS_OPERAND(top)) {
13063 SvREFCNT_dec_NN(top);
13064 SvREFCNT_dec_NN(current);
13065 vFAIL("Operand with no preceding operator");
13067 current_operator = (char) SvUV(top);
13068 switch (current_operator) {
13069 case '(': /* Push the '(' back on followed by the new
13071 av_push(stack, top);
13072 av_push(stack, current);
13073 SvREFCNT_inc(top); /* Counters the '_dec' done
13074 just after the 'break', so
13075 it doesn't get wrongly freed
13080 _invlist_invert(current);
13082 /* Unlike binary operators, the top of the stack,
13083 * now that this unary one has been popped off, may
13084 * legally be an operator, and we now have operand
13087 SvREFCNT_dec_NN(top);
13088 goto handle_operand;
13091 prev = av_pop(stack);
13092 _invlist_intersection(prev,
13095 av_push(stack, current);
13100 prev = av_pop(stack);
13101 _invlist_union(prev, current, ¤t);
13102 av_push(stack, current);
13106 prev = av_pop(stack);;
13107 _invlist_subtract(prev, current, ¤t);
13108 av_push(stack, current);
13111 case '^': /* The union minus the intersection */
13117 prev = av_pop(stack);
13118 _invlist_union(prev, current, &u);
13119 _invlist_intersection(prev, current, &i);
13120 /* _invlist_subtract will overwrite current
13121 without freeing what it already contains */
13123 _invlist_subtract(u, i, ¤t);
13124 av_push(stack, current);
13125 SvREFCNT_dec_NN(i);
13126 SvREFCNT_dec_NN(u);
13127 SvREFCNT_dec_NN(element);
13132 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13134 SvREFCNT_dec_NN(top);
13135 SvREFCNT_dec(prev);
13139 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13142 if (av_tindex(stack) < 0 /* Was empty */
13143 || ((final = av_pop(stack)) == NULL)
13144 || ! IS_OPERAND(final)
13145 || av_tindex(stack) >= 0) /* More left on stack */
13147 vFAIL("Incomplete expression within '(?[ ])'");
13150 /* Here, 'final' is the resultant inversion list from evaluating the
13151 * expression. Return it if so requested */
13152 if (return_invlist) {
13153 *return_invlist = final;
13157 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13158 * expecting a string of ranges and individual code points */
13159 invlist_iterinit(final);
13160 result_string = newSVpvs("");
13161 while (invlist_iternext(final, &start, &end)) {
13162 if (start == end) {
13163 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13166 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13171 save_parse = RExC_parse;
13172 RExC_parse = SvPV(result_string, len);
13173 save_end = RExC_end;
13174 RExC_end = RExC_parse + len;
13176 /* We turn off folding around the call, as the class we have constructed
13177 * already has all folding taken into consideration, and we don't want
13178 * regclass() to add to that */
13179 RExC_flags &= ~RXf_PMf_FOLD;
13180 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13182 node = regclass(pRExC_state, flagp,depth+1,
13183 FALSE, /* means parse the whole char class */
13184 FALSE, /* don't allow multi-char folds */
13185 TRUE, /* silence non-portable warnings. The above may very
13186 well have generated non-portable code points, but
13187 they're valid on this machine */
13190 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13193 RExC_flags |= RXf_PMf_FOLD;
13195 RExC_parse = save_parse + 1;
13196 RExC_end = save_end;
13197 SvREFCNT_dec_NN(final);
13198 SvREFCNT_dec_NN(result_string);
13200 nextchar(pRExC_state);
13201 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13207 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13209 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13210 * innocent-looking character class, like /[ks]/i won't have to go out to
13211 * disk to find the possible matches.
13213 * This should be called only for a Latin1-range code points, cp, which is
13214 * known to be involved in a simple fold with other code points above
13215 * Latin1. It would give false results if /aa has been specified.
13216 * Multi-char folds are outside the scope of this, and must be handled
13219 * XXX It would be better to generate these via regen, in case a new
13220 * version of the Unicode standard adds new mappings, though that is not
13221 * really likely, and may be caught by the default: case of the switch
13224 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13226 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13232 add_cp_to_invlist(*invlist, KELVIN_SIGN);
13236 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13239 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13240 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13242 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13243 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13244 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13246 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13247 *invlist = add_cp_to_invlist(*invlist,
13248 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13250 case LATIN_SMALL_LETTER_SHARP_S:
13251 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13254 /* Use deprecated warning to increase the chances of this being
13256 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13261 /* The names of properties whose definitions are not known at compile time are
13262 * stored in this SV, after a constant heading. So if the length has been
13263 * changed since initialization, then there is a run-time definition. */
13264 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
13265 (SvCUR(listsv) != initial_listsv_len)
13268 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13269 const bool stop_at_1, /* Just parse the next thing, don't
13270 look for a full character class */
13271 bool allow_multi_folds,
13272 const bool silence_non_portable, /* Don't output warnings
13275 SV** ret_invlist) /* Return an inversion list, not a node */
13277 /* parse a bracketed class specification. Most of these will produce an
13278 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13279 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
13280 * under /i with multi-character folds: it will be rewritten following the
13281 * paradigm of this example, where the <multi-fold>s are characters which
13282 * fold to multiple character sequences:
13283 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13284 * gets effectively rewritten as:
13285 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13286 * reg() gets called (recursively) on the rewritten version, and this
13287 * function will return what it constructs. (Actually the <multi-fold>s
13288 * aren't physically removed from the [abcdefghi], it's just that they are
13289 * ignored in the recursion by means of a flag:
13290 * <RExC_in_multi_char_class>.)
13292 * ANYOF nodes contain a bit map for the first 256 characters, with the
13293 * corresponding bit set if that character is in the list. For characters
13294 * above 255, a range list or swash is used. There are extra bits for \w,
13295 * etc. in locale ANYOFs, as what these match is not determinable at
13298 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13299 * to be restarted. This can only happen if ret_invlist is non-NULL.
13302 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13304 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13307 IV namedclass = OOB_NAMEDCLASS;
13308 char *rangebegin = NULL;
13309 bool need_class = 0;
13311 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13312 than just initialized. */
13313 SV* properties = NULL; /* Code points that match \p{} \P{} */
13314 SV* posixes = NULL; /* Code points that match classes like [:word:],
13315 extended beyond the Latin1 range. These have to
13316 be kept separate from other code points for much
13317 of this function because their handling is
13318 different under /i, and for most classes under
13320 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
13321 separate for a while from the non-complemented
13322 versions because of complications with /d
13324 UV element_count = 0; /* Number of distinct elements in the class.
13325 Optimizations may be possible if this is tiny */
13326 AV * multi_char_matches = NULL; /* Code points that fold to more than one
13327 character; used under /i */
13329 char * stop_ptr = RExC_end; /* where to stop parsing */
13330 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13332 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13334 /* Unicode properties are stored in a swash; this holds the current one
13335 * being parsed. If this swash is the only above-latin1 component of the
13336 * character class, an optimization is to pass it directly on to the
13337 * execution engine. Otherwise, it is set to NULL to indicate that there
13338 * are other things in the class that have to be dealt with at execution
13340 SV* swash = NULL; /* Code points that match \p{} \P{} */
13342 /* Set if a component of this character class is user-defined; just passed
13343 * on to the engine */
13344 bool has_user_defined_property = FALSE;
13346 /* inversion list of code points this node matches only when the target
13347 * string is in UTF-8. (Because is under /d) */
13348 SV* depends_list = NULL;
13350 /* Inversion list of code points this node matches regardless of things
13351 * like locale, folding, utf8ness of the target string */
13352 SV* cp_list = NULL;
13354 /* Like cp_list, but code points on this list need to be checked for things
13355 * that fold to/from them under /i */
13356 SV* cp_foldable_list = NULL;
13358 /* Like cp_list, but code points on this list are valid only when the
13359 * runtime locale is UTF-8 */
13360 SV* only_utf8_locale_list = NULL;
13363 /* In a range, counts how many 0-2 of the ends of it came from literals,
13364 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
13365 UV literal_endpoint = 0;
13367 bool invert = FALSE; /* Is this class to be complemented */
13369 bool warn_super = ALWAYS_WARN_SUPER;
13371 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13372 case we need to change the emitted regop to an EXACT. */
13373 const char * orig_parse = RExC_parse;
13374 const SSize_t orig_size = RExC_size;
13375 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13376 GET_RE_DEBUG_FLAGS_DECL;
13378 PERL_ARGS_ASSERT_REGCLASS;
13380 PERL_UNUSED_ARG(depth);
13383 DEBUG_PARSE("clas");
13385 /* Assume we are going to generate an ANYOF node. */
13386 ret = reganode(pRExC_state, ANYOF, 0);
13389 RExC_size += ANYOF_SKIP;
13390 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13393 ANYOF_FLAGS(ret) = 0;
13395 RExC_emit += ANYOF_SKIP;
13396 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13397 initial_listsv_len = SvCUR(listsv);
13398 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
13402 RExC_parse = regpatws(pRExC_state, RExC_parse,
13403 FALSE /* means don't recognize comments */ );
13406 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13409 allow_multi_folds = FALSE;
13412 RExC_parse = regpatws(pRExC_state, RExC_parse,
13413 FALSE /* means don't recognize comments */ );
13417 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13418 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13419 const char *s = RExC_parse;
13420 const char c = *s++;
13422 while (isWORDCHAR(*s))
13424 if (*s && c == *s && s[1] == ']') {
13425 SAVEFREESV(RExC_rx_sv);
13427 "POSIX syntax [%c %c] belongs inside character classes",
13429 (void)ReREFCNT_inc(RExC_rx_sv);
13433 /* If the caller wants us to just parse a single element, accomplish this
13434 * by faking the loop ending condition */
13435 if (stop_at_1 && RExC_end > RExC_parse) {
13436 stop_ptr = RExC_parse + 1;
13439 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13440 if (UCHARAT(RExC_parse) == ']')
13441 goto charclassloop;
13445 if (RExC_parse >= stop_ptr) {
13450 RExC_parse = regpatws(pRExC_state, RExC_parse,
13451 FALSE /* means don't recognize comments */ );
13454 if (UCHARAT(RExC_parse) == ']') {
13460 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13461 save_value = value;
13462 save_prevvalue = prevvalue;
13465 rangebegin = RExC_parse;
13469 value = utf8n_to_uvchr((U8*)RExC_parse,
13470 RExC_end - RExC_parse,
13471 &numlen, UTF8_ALLOW_DEFAULT);
13472 RExC_parse += numlen;
13475 value = UCHARAT(RExC_parse++);
13478 && RExC_parse < RExC_end
13479 && POSIXCC(UCHARAT(RExC_parse)))
13481 namedclass = regpposixcc(pRExC_state, value, strict);
13483 else if (value == '\\') {
13485 value = utf8n_to_uvchr((U8*)RExC_parse,
13486 RExC_end - RExC_parse,
13487 &numlen, UTF8_ALLOW_DEFAULT);
13488 RExC_parse += numlen;
13491 value = UCHARAT(RExC_parse++);
13493 /* Some compilers cannot handle switching on 64-bit integer
13494 * values, therefore value cannot be an UV. Yes, this will
13495 * be a problem later if we want switch on Unicode.
13496 * A similar issue a little bit later when switching on
13497 * namedclass. --jhi */
13499 /* If the \ is escaping white space when white space is being
13500 * skipped, it means that that white space is wanted literally, and
13501 * is already in 'value'. Otherwise, need to translate the escape
13502 * into what it signifies. */
13503 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13505 case 'w': namedclass = ANYOF_WORDCHAR; break;
13506 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13507 case 's': namedclass = ANYOF_SPACE; break;
13508 case 'S': namedclass = ANYOF_NSPACE; break;
13509 case 'd': namedclass = ANYOF_DIGIT; break;
13510 case 'D': namedclass = ANYOF_NDIGIT; break;
13511 case 'v': namedclass = ANYOF_VERTWS; break;
13512 case 'V': namedclass = ANYOF_NVERTWS; break;
13513 case 'h': namedclass = ANYOF_HORIZWS; break;
13514 case 'H': namedclass = ANYOF_NHORIZWS; break;
13515 case 'N': /* Handle \N{NAME} in class */
13517 /* We only pay attention to the first char of
13518 multichar strings being returned. I kinda wonder
13519 if this makes sense as it does change the behaviour
13520 from earlier versions, OTOH that behaviour was broken
13522 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13523 TRUE, /* => charclass */
13526 if (*flagp & RESTART_UTF8)
13527 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13537 /* We will handle any undefined properties ourselves */
13538 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13539 /* And we actually would prefer to get
13540 * the straight inversion list of the
13541 * swash, since we will be accessing it
13542 * anyway, to save a little time */
13543 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13545 if (RExC_parse >= RExC_end)
13546 vFAIL2("Empty \\%c{}", (U8)value);
13547 if (*RExC_parse == '{') {
13548 const U8 c = (U8)value;
13549 e = strchr(RExC_parse++, '}');
13551 vFAIL2("Missing right brace on \\%c{}", c);
13552 while (isSPACE(*RExC_parse))
13554 if (e == RExC_parse)
13555 vFAIL2("Empty \\%c{}", c);
13556 n = e - RExC_parse;
13557 while (isSPACE(*(RExC_parse + n - 1)))
13568 if (UCHARAT(RExC_parse) == '^') {
13571 /* toggle. (The rhs xor gets the single bit that
13572 * differs between P and p; the other xor inverts just
13574 value ^= 'P' ^ 'p';
13576 while (isSPACE(*RExC_parse)) {
13581 /* Try to get the definition of the property into
13582 * <invlist>. If /i is in effect, the effective property
13583 * will have its name be <__NAME_i>. The design is
13584 * discussed in commit
13585 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13586 name = savepv(Perl_form(aTHX_
13588 (FOLD) ? "__" : "",
13594 /* Look up the property name, and get its swash and
13595 * inversion list, if the property is found */
13597 SvREFCNT_dec_NN(swash);
13599 swash = _core_swash_init("utf8", name, &PL_sv_undef,
13602 NULL, /* No inversion list */
13605 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13606 HV* curpkg = (IN_PERL_COMPILETIME)
13608 : CopSTASH(PL_curcop);
13610 SvREFCNT_dec_NN(swash);
13614 /* Here didn't find it. It could be a user-defined
13615 * property that will be available at run-time. If we
13616 * accept only compile-time properties, is an error;
13617 * otherwise add it to the list for run-time look up */
13619 RExC_parse = e + 1;
13621 "Property '%"UTF8f"' is unknown",
13622 UTF8fARG(UTF, n, name));
13625 /* If the property name doesn't already have a package
13626 * name, add the current one to it so that it can be
13627 * referred to outside it. [perl #121777] */
13628 if (curpkg && ! instr(name, "::")) {
13629 char* pkgname = HvNAME(curpkg);
13630 if (strNE(pkgname, "main")) {
13631 char* full_name = Perl_form(aTHX_
13635 n = strlen(full_name);
13637 name = savepvn(full_name, n);
13640 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13641 (value == 'p' ? '+' : '!'),
13642 UTF8fARG(UTF, n, name));
13643 has_user_defined_property = TRUE;
13645 /* We don't know yet, so have to assume that the
13646 * property could match something in the Latin1 range,
13647 * hence something that isn't utf8. Note that this
13648 * would cause things in <depends_list> to match
13649 * inappropriately, except that any \p{}, including
13650 * this one forces Unicode semantics, which means there
13651 * is no <depends_list> */
13652 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13656 /* Here, did get the swash and its inversion list. If
13657 * the swash is from a user-defined property, then this
13658 * whole character class should be regarded as such */
13659 if (swash_init_flags
13660 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13662 has_user_defined_property = TRUE;
13665 /* We warn on matching an above-Unicode code point
13666 * if the match would return true, except don't
13667 * warn for \p{All}, which has exactly one element
13669 (_invlist_contains_cp(invlist, 0x110000)
13670 && (! (_invlist_len(invlist) == 1
13671 && *invlist_array(invlist) == 0)))
13677 /* Invert if asking for the complement */
13678 if (value == 'P') {
13679 _invlist_union_complement_2nd(properties,
13683 /* The swash can't be used as-is, because we've
13684 * inverted things; delay removing it to here after
13685 * have copied its invlist above */
13686 SvREFCNT_dec_NN(swash);
13690 _invlist_union(properties, invlist, &properties);
13695 RExC_parse = e + 1;
13696 namedclass = ANYOF_UNIPROP; /* no official name, but it's
13699 /* \p means they want Unicode semantics */
13700 RExC_uni_semantics = 1;
13703 case 'n': value = '\n'; break;
13704 case 'r': value = '\r'; break;
13705 case 't': value = '\t'; break;
13706 case 'f': value = '\f'; break;
13707 case 'b': value = '\b'; break;
13708 case 'e': value = ASCII_TO_NATIVE('\033');break;
13709 case 'a': value = '\a'; break;
13711 RExC_parse--; /* function expects to be pointed at the 'o' */
13713 const char* error_msg;
13714 bool valid = grok_bslash_o(&RExC_parse,
13717 SIZE_ONLY, /* warnings in pass
13720 silence_non_portable,
13726 if (PL_encoding && value < 0x100) {
13727 goto recode_encoding;
13731 RExC_parse--; /* function expects to be pointed at the 'x' */
13733 const char* error_msg;
13734 bool valid = grok_bslash_x(&RExC_parse,
13737 TRUE, /* Output warnings */
13739 silence_non_portable,
13745 if (PL_encoding && value < 0x100)
13746 goto recode_encoding;
13749 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13751 case '0': case '1': case '2': case '3': case '4':
13752 case '5': case '6': case '7':
13754 /* Take 1-3 octal digits */
13755 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13756 numlen = (strict) ? 4 : 3;
13757 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13758 RExC_parse += numlen;
13761 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13762 vFAIL("Need exactly 3 octal digits");
13764 else if (! SIZE_ONLY /* like \08, \178 */
13766 && RExC_parse < RExC_end
13767 && isDIGIT(*RExC_parse)
13768 && ckWARN(WARN_REGEXP))
13770 SAVEFREESV(RExC_rx_sv);
13771 reg_warn_non_literal_string(
13773 form_short_octal_warning(RExC_parse, numlen));
13774 (void)ReREFCNT_inc(RExC_rx_sv);
13777 if (PL_encoding && value < 0x100)
13778 goto recode_encoding;
13782 if (! RExC_override_recoding) {
13783 SV* enc = PL_encoding;
13784 value = reg_recode((const char)(U8)value, &enc);
13787 vFAIL("Invalid escape in the specified encoding");
13789 else if (SIZE_ONLY) {
13790 ckWARNreg(RExC_parse,
13791 "Invalid escape in the specified encoding");
13797 /* Allow \_ to not give an error */
13798 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13800 vFAIL2("Unrecognized escape \\%c in character class",
13804 SAVEFREESV(RExC_rx_sv);
13805 ckWARN2reg(RExC_parse,
13806 "Unrecognized escape \\%c in character class passed through",
13808 (void)ReREFCNT_inc(RExC_rx_sv);
13812 } /* End of switch on char following backslash */
13813 } /* end of handling backslash escape sequences */
13816 literal_endpoint++;
13819 /* Here, we have the current token in 'value' */
13821 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13824 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
13825 * literal, as is the character that began the false range, i.e.
13826 * the 'a' in the examples */
13829 const int w = (RExC_parse >= rangebegin)
13830 ? RExC_parse - rangebegin
13834 "False [] range \"%"UTF8f"\"",
13835 UTF8fARG(UTF, w, rangebegin));
13838 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13839 ckWARN2reg(RExC_parse,
13840 "False [] range \"%"UTF8f"\"",
13841 UTF8fARG(UTF, w, rangebegin));
13842 (void)ReREFCNT_inc(RExC_rx_sv);
13843 cp_list = add_cp_to_invlist(cp_list, '-');
13844 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13849 range = 0; /* this was not a true range */
13850 element_count += 2; /* So counts for three values */
13853 classnum = namedclass_to_classnum(namedclass);
13855 if (LOC && namedclass < ANYOF_POSIXL_MAX
13856 #ifndef HAS_ISASCII
13857 && classnum != _CC_ASCII
13860 /* What the Posix classes (like \w, [:space:]) match in locale
13861 * isn't knowable under locale until actual match time. Room
13862 * must be reserved (one time per outer bracketed class) to
13863 * store such classes. The space will contain a bit for each
13864 * named class that is to be matched against. This isn't
13865 * needed for \p{} and pseudo-classes, as they are not affected
13866 * by locale, and hence are dealt with separately */
13867 if (! need_class) {
13870 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13873 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13875 ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13876 ANYOF_POSIXL_ZERO(ret);
13879 /* Coverity thinks it is possible for this to be negative; both
13880 * jhi and khw think it's not, but be safer */
13881 assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13882 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
13884 /* See if it already matches the complement of this POSIX
13886 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13887 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13891 posixl_matches_all = TRUE;
13892 break; /* No need to continue. Since it matches both
13893 e.g., \w and \W, it matches everything, and the
13894 bracketed class can be optimized into qr/./s */
13897 /* Add this class to those that should be checked at runtime */
13898 ANYOF_POSIXL_SET(ret, namedclass);
13900 /* The above-Latin1 characters are not subject to locale rules.
13901 * Just add them, in the second pass, to the
13902 * unconditionally-matched list */
13904 SV* scratch_list = NULL;
13906 /* Get the list of the above-Latin1 code points this
13908 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13909 PL_XPosix_ptrs[classnum],
13911 /* Odd numbers are complements, like
13912 * NDIGIT, NASCII, ... */
13913 namedclass % 2 != 0,
13915 /* Checking if 'cp_list' is NULL first saves an extra
13916 * clone. Its reference count will be decremented at the
13917 * next union, etc, or if this is the only instance, at the
13918 * end of the routine */
13920 cp_list = scratch_list;
13923 _invlist_union(cp_list, scratch_list, &cp_list);
13924 SvREFCNT_dec_NN(scratch_list);
13926 continue; /* Go get next character */
13929 else if (! SIZE_ONLY) {
13931 /* Here, not in pass1 (in that pass we skip calculating the
13932 * contents of this class), and is /l, or is a POSIX class for
13933 * which /l doesn't matter (or is a Unicode property, which is
13934 * skipped here). */
13935 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
13936 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13938 /* Here, should be \h, \H, \v, or \V. None of /d, /i
13939 * nor /l make a difference in what these match,
13940 * therefore we just add what they match to cp_list. */
13941 if (classnum != _CC_VERTSPACE) {
13942 assert( namedclass == ANYOF_HORIZWS
13943 || namedclass == ANYOF_NHORIZWS);
13945 /* It turns out that \h is just a synonym for
13947 classnum = _CC_BLANK;
13950 _invlist_union_maybe_complement_2nd(
13952 PL_XPosix_ptrs[classnum],
13953 namedclass % 2 != 0, /* Complement if odd
13954 (NHORIZWS, NVERTWS)
13959 else { /* Garden variety class. If is NASCII, NDIGIT, ...
13960 complement and use nposixes */
13961 SV** posixes_ptr = namedclass % 2 == 0
13964 SV** source_ptr = &PL_XPosix_ptrs[classnum];
13965 _invlist_union_maybe_complement_2nd(
13968 namedclass % 2 != 0,
13971 continue; /* Go get next character */
13973 } /* end of namedclass \blah */
13975 /* Here, we have a single value. If 'range' is set, it is the ending
13976 * of a range--check its validity. Later, we will handle each
13977 * individual code point in the range. If 'range' isn't set, this
13978 * could be the beginning of a range, so check for that by looking
13979 * ahead to see if the next real character to be processed is the range
13980 * indicator--the minus sign */
13983 RExC_parse = regpatws(pRExC_state, RExC_parse,
13984 FALSE /* means don't recognize comments */ );
13988 if (prevvalue > value) /* b-a */ {
13989 const int w = RExC_parse - rangebegin;
13991 "Invalid [] range \"%"UTF8f"\"",
13992 UTF8fARG(UTF, w, rangebegin));
13993 range = 0; /* not a valid range */
13997 prevvalue = value; /* save the beginning of the potential range */
13998 if (! stop_at_1 /* Can't be a range if parsing just one thing */
13999 && *RExC_parse == '-')
14001 char* next_char_ptr = RExC_parse + 1;
14002 if (skip_white) { /* Get the next real char after the '-' */
14003 next_char_ptr = regpatws(pRExC_state,
14005 FALSE); /* means don't recognize
14009 /* If the '-' is at the end of the class (just before the ']',
14010 * it is a literal minus; otherwise it is a range */
14011 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14012 RExC_parse = next_char_ptr;
14014 /* a bad range like \w-, [:word:]- ? */
14015 if (namedclass > OOB_NAMEDCLASS) {
14016 if (strict || ckWARN(WARN_REGEXP)) {
14018 RExC_parse >= rangebegin ?
14019 RExC_parse - rangebegin : 0;
14021 vFAIL4("False [] range \"%*.*s\"",
14026 "False [] range \"%*.*s\"",
14031 cp_list = add_cp_to_invlist(cp_list, '-');
14035 range = 1; /* yeah, it's a range! */
14036 continue; /* but do it the next time */
14041 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
14044 /* non-Latin1 code point implies unicode semantics. Must be set in
14045 * pass1 so is there for the whole of pass 2 */
14047 RExC_uni_semantics = 1;
14050 /* Ready to process either the single value, or the completed range.
14051 * For single-valued non-inverted ranges, we consider the possibility
14052 * of multi-char folds. (We made a conscious decision to not do this
14053 * for the other cases because it can often lead to non-intuitive
14054 * results. For example, you have the peculiar case that:
14055 * "s s" =~ /^[^\xDF]+$/i => Y
14056 * "ss" =~ /^[^\xDF]+$/i => N
14058 * See [perl #89750] */
14059 if (FOLD && allow_multi_folds && value == prevvalue) {
14060 if (value == LATIN_SMALL_LETTER_SHARP_S
14061 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14064 /* Here <value> is indeed a multi-char fold. Get what it is */
14066 U8 foldbuf[UTF8_MAXBYTES_CASE];
14069 UV folded = _to_uni_fold_flags(
14073 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14074 ? FOLD_FLAGS_NOMIX_ASCII
14078 /* Here, <folded> should be the first character of the
14079 * multi-char fold of <value>, with <foldbuf> containing the
14080 * whole thing. But, if this fold is not allowed (because of
14081 * the flags), <fold> will be the same as <value>, and should
14082 * be processed like any other character, so skip the special
14084 if (folded != value) {
14086 /* Skip if we are recursed, currently parsing the class
14087 * again. Otherwise add this character to the list of
14088 * multi-char folds. */
14089 if (! RExC_in_multi_char_class) {
14090 AV** this_array_ptr;
14092 STRLEN cp_count = utf8_length(foldbuf,
14093 foldbuf + foldlen);
14094 SV* multi_fold = sv_2mortal(newSVpvs(""));
14096 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14099 if (! multi_char_matches) {
14100 multi_char_matches = newAV();
14103 /* <multi_char_matches> is actually an array of arrays.
14104 * There will be one or two top-level elements: [2],
14105 * and/or [3]. The [2] element is an array, each
14106 * element thereof is a character which folds to TWO
14107 * characters; [3] is for folds to THREE characters.
14108 * (Unicode guarantees a maximum of 3 characters in any
14109 * fold.) When we rewrite the character class below,
14110 * we will do so such that the longest folds are
14111 * written first, so that it prefers the longest
14112 * matching strings first. This is done even if it
14113 * turns out that any quantifier is non-greedy, out of
14114 * programmer laziness. Tom Christiansen has agreed
14115 * that this is ok. This makes the test for the
14116 * ligature 'ffi' come before the test for 'ff' */
14117 if (av_exists(multi_char_matches, cp_count)) {
14118 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14120 this_array = *this_array_ptr;
14123 this_array = newAV();
14124 av_store(multi_char_matches, cp_count,
14127 av_push(this_array, multi_fold);
14130 /* This element should not be processed further in this
14133 value = save_value;
14134 prevvalue = save_prevvalue;
14140 /* Deal with this element of the class */
14143 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14146 SV* this_range = _new_invlist(1);
14147 _append_range_to_invlist(this_range, prevvalue, value);
14149 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14150 * If this range was specified using something like 'i-j', we want
14151 * to include only the 'i' and the 'j', and not anything in
14152 * between, so exclude non-ASCII, non-alphabetics from it.
14153 * However, if the range was specified with something like
14154 * [\x89-\x91] or [\x89-j], all code points within it should be
14155 * included. literal_endpoint==2 means both ends of the range used
14156 * a literal character, not \x{foo} */
14157 if (literal_endpoint == 2
14158 && ((prevvalue >= 'a' && value <= 'z')
14159 || (prevvalue >= 'A' && value <= 'Z')))
14161 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14164 /* Since this above only contains ascii, the intersection of it
14165 * with anything will still yield only ascii */
14166 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14169 _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14170 literal_endpoint = 0;
14174 range = 0; /* this range (if it was one) is done now */
14175 } /* End of loop through all the text within the brackets */
14177 /* If anything in the class expands to more than one character, we have to
14178 * deal with them by building up a substitute parse string, and recursively
14179 * calling reg() on it, instead of proceeding */
14180 if (multi_char_matches) {
14181 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14184 char *save_end = RExC_end;
14185 char *save_parse = RExC_parse;
14186 bool first_time = TRUE; /* First multi-char occurrence doesn't get
14191 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
14192 because too confusing */
14194 sv_catpv(substitute_parse, "(?:");
14198 /* Look at the longest folds first */
14199 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14201 if (av_exists(multi_char_matches, cp_count)) {
14202 AV** this_array_ptr;
14205 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14207 while ((this_sequence = av_pop(*this_array_ptr)) !=
14210 if (! first_time) {
14211 sv_catpv(substitute_parse, "|");
14213 first_time = FALSE;
14215 sv_catpv(substitute_parse, SvPVX(this_sequence));
14220 /* If the character class contains anything else besides these
14221 * multi-character folds, have to include it in recursive parsing */
14222 if (element_count) {
14223 sv_catpv(substitute_parse, "|[");
14224 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14225 sv_catpv(substitute_parse, "]");
14228 sv_catpv(substitute_parse, ")");
14231 /* This is a way to get the parse to skip forward a whole named
14232 * sequence instead of matching the 2nd character when it fails the
14234 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14238 RExC_parse = SvPV(substitute_parse, len);
14239 RExC_end = RExC_parse + len;
14240 RExC_in_multi_char_class = 1;
14241 RExC_emit = (regnode *)orig_emit;
14243 ret = reg(pRExC_state, 1, ®_flags, depth+1);
14245 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14247 RExC_parse = save_parse;
14248 RExC_end = save_end;
14249 RExC_in_multi_char_class = 0;
14250 SvREFCNT_dec_NN(multi_char_matches);
14254 /* Here, we've gone through the entire class and dealt with multi-char
14255 * folds. We are now in a position that we can do some checks to see if we
14256 * can optimize this ANYOF node into a simpler one, even in Pass 1.
14257 * Currently we only do two checks:
14258 * 1) is in the unlikely event that the user has specified both, eg. \w and
14259 * \W under /l, then the class matches everything. (This optimization
14260 * is done only to make the optimizer code run later work.)
14261 * 2) if the character class contains only a single element (including a
14262 * single range), we see if there is an equivalent node for it.
14263 * Other checks are possible */
14264 if (! ret_invlist /* Can't optimize if returning the constructed
14266 && (UNLIKELY(posixl_matches_all) || element_count == 1))
14271 if (UNLIKELY(posixl_matches_all)) {
14274 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14275 \w or [:digit:] or \p{foo}
14278 /* All named classes are mapped into POSIXish nodes, with its FLAG
14279 * argument giving which class it is */
14280 switch ((I32)namedclass) {
14281 case ANYOF_UNIPROP:
14284 /* These don't depend on the charset modifiers. They always
14285 * match under /u rules */
14286 case ANYOF_NHORIZWS:
14287 case ANYOF_HORIZWS:
14288 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14291 case ANYOF_NVERTWS:
14296 /* The actual POSIXish node for all the rest depends on the
14297 * charset modifier. The ones in the first set depend only on
14298 * ASCII or, if available on this platform, locale */
14302 op = (LOC) ? POSIXL : POSIXA;
14313 /* under /a could be alpha */
14315 if (ASCII_RESTRICTED) {
14316 namedclass = ANYOF_ALPHA + (namedclass % 2);
14324 /* The rest have more possibilities depending on the charset.
14325 * We take advantage of the enum ordering of the charset
14326 * modifiers to get the exact node type, */
14328 op = POSIXD + get_regex_charset(RExC_flags);
14329 if (op > POSIXA) { /* /aa is same as /a */
14334 /* The odd numbered ones are the complements of the
14335 * next-lower even number one */
14336 if (namedclass % 2 == 1) {
14340 arg = namedclass_to_classnum(namedclass);
14344 else if (value == prevvalue) {
14346 /* Here, the class consists of just a single code point */
14349 if (! LOC && value == '\n') {
14350 op = REG_ANY; /* Optimize [^\n] */
14351 *flagp |= HASWIDTH|SIMPLE;
14355 else if (value < 256 || UTF) {
14357 /* Optimize a single value into an EXACTish node, but not if it
14358 * would require converting the pattern to UTF-8. */
14359 op = compute_EXACTish(pRExC_state);
14361 } /* Otherwise is a range */
14362 else if (! LOC) { /* locale could vary these */
14363 if (prevvalue == '0') {
14364 if (value == '9') {
14369 else if (prevvalue == 'A') {
14372 && literal_endpoint == 2
14375 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14379 else if (prevvalue == 'a') {
14382 && literal_endpoint == 2
14385 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14391 /* Here, we have changed <op> away from its initial value iff we found
14392 * an optimization */
14395 /* Throw away this ANYOF regnode, and emit the calculated one,
14396 * which should correspond to the beginning, not current, state of
14398 const char * cur_parse = RExC_parse;
14399 RExC_parse = (char *)orig_parse;
14403 /* To get locale nodes to not use the full ANYOF size would
14404 * require moving the code above that writes the portions
14405 * of it that aren't in other nodes to after this point.
14406 * e.g. ANYOF_POSIXL_SET */
14407 RExC_size = orig_size;
14411 RExC_emit = (regnode *)orig_emit;
14412 if (PL_regkind[op] == POSIXD) {
14413 if (op == POSIXL) {
14414 RExC_contains_locale = 1;
14417 op += NPOSIXD - POSIXD;
14422 ret = reg_node(pRExC_state, op);
14424 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14428 *flagp |= HASWIDTH|SIMPLE;
14430 else if (PL_regkind[op] == EXACT) {
14431 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14432 TRUE /* downgradable to EXACT */
14436 RExC_parse = (char *) cur_parse;
14438 SvREFCNT_dec(posixes);
14439 SvREFCNT_dec(nposixes);
14440 SvREFCNT_dec(cp_list);
14441 SvREFCNT_dec(cp_foldable_list);
14448 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14450 /* If folding, we calculate all characters that could fold to or from the
14451 * ones already on the list */
14452 if (cp_foldable_list) {
14454 UV start, end; /* End points of code point ranges */
14456 SV* fold_intersection = NULL;
14459 /* Our calculated list will be for Unicode rules. For locale
14460 * matching, we have to keep a separate list that is consulted at
14461 * runtime only when the locale indicates Unicode rules. For
14462 * non-locale, we just use to the general list */
14464 use_list = &only_utf8_locale_list;
14467 use_list = &cp_list;
14470 /* Only the characters in this class that participate in folds need
14471 * be checked. Get the intersection of this class and all the
14472 * possible characters that are foldable. This can quickly narrow
14473 * down a large class */
14474 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14475 &fold_intersection);
14477 /* The folds for all the Latin1 characters are hard-coded into this
14478 * program, but we have to go out to disk to get the others. */
14479 if (invlist_highest(cp_foldable_list) >= 256) {
14481 /* This is a hash that for a particular fold gives all
14482 * characters that are involved in it */
14483 if (! PL_utf8_foldclosures) {
14484 _load_PL_utf8_foldclosures();
14488 /* Now look at the foldable characters in this class individually */
14489 invlist_iterinit(fold_intersection);
14490 while (invlist_iternext(fold_intersection, &start, &end)) {
14493 /* Look at every character in the range */
14494 for (j = start; j <= end; j++) {
14495 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14501 if (IS_IN_SOME_FOLD_L1(j)) {
14503 /* ASCII is always matched; non-ASCII is matched
14504 * only under Unicode rules (which could happen
14505 * under /l if the locale is a UTF-8 one */
14506 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14507 *use_list = add_cp_to_invlist(*use_list,
14508 PL_fold_latin1[j]);
14512 add_cp_to_invlist(depends_list,
14513 PL_fold_latin1[j]);
14517 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14518 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14520 add_above_Latin1_folds(pRExC_state,
14527 /* Here is an above Latin1 character. We don't have the
14528 * rules hard-coded for it. First, get its fold. This is
14529 * the simple fold, as the multi-character folds have been
14530 * handled earlier and separated out */
14531 _to_uni_fold_flags(j, foldbuf, &foldlen,
14532 (ASCII_FOLD_RESTRICTED)
14533 ? FOLD_FLAGS_NOMIX_ASCII
14536 /* Single character fold of above Latin1. Add everything in
14537 * its fold closure to the list that this node should match.
14538 * The fold closures data structure is a hash with the keys
14539 * being the UTF-8 of every character that is folded to, like
14540 * 'k', and the values each an array of all code points that
14541 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14542 * Multi-character folds are not included */
14543 if ((listp = hv_fetch(PL_utf8_foldclosures,
14544 (char *) foldbuf, foldlen, FALSE)))
14546 AV* list = (AV*) *listp;
14548 for (k = 0; k <= av_tindex(list); k++) {
14549 SV** c_p = av_fetch(list, k, FALSE);
14555 /* /aa doesn't allow folds between ASCII and non- */
14556 if ((ASCII_FOLD_RESTRICTED
14557 && (isASCII(c) != isASCII(j))))
14562 /* Folds under /l which cross the 255/256 boundary
14563 * are added to a separate list. (These are valid
14564 * only when the locale is UTF-8.) */
14565 if (c < 256 && LOC) {
14566 *use_list = add_cp_to_invlist(*use_list, c);
14570 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14572 cp_list = add_cp_to_invlist(cp_list, c);
14575 /* Similarly folds involving non-ascii Latin1
14576 * characters under /d are added to their list */
14577 depends_list = add_cp_to_invlist(depends_list,
14584 SvREFCNT_dec_NN(fold_intersection);
14587 /* Now that we have finished adding all the folds, there is no reason
14588 * to keep the foldable list separate */
14589 _invlist_union(cp_list, cp_foldable_list, &cp_list);
14590 SvREFCNT_dec_NN(cp_foldable_list);
14593 /* And combine the result (if any) with any inversion list from posix
14594 * classes. The lists are kept separate up to now because we don't want to
14595 * fold the classes (folding of those is automatically handled by the swash
14596 * fetching code) */
14597 if (posixes || nposixes) {
14598 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14599 /* Under /a and /aa, nothing above ASCII matches these */
14600 _invlist_intersection(posixes,
14601 PL_XPosix_ptrs[_CC_ASCII],
14605 if (DEPENDS_SEMANTICS) {
14606 /* Under /d, everything in the upper half of the Latin1 range
14607 * matches these complements */
14608 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14610 else if (AT_LEAST_ASCII_RESTRICTED) {
14611 /* Under /a and /aa, everything above ASCII matches these
14613 _invlist_union_complement_2nd(nposixes,
14614 PL_XPosix_ptrs[_CC_ASCII],
14618 _invlist_union(posixes, nposixes, &posixes);
14619 SvREFCNT_dec_NN(nposixes);
14622 posixes = nposixes;
14625 if (! DEPENDS_SEMANTICS) {
14627 _invlist_union(cp_list, posixes, &cp_list);
14628 SvREFCNT_dec_NN(posixes);
14635 /* Under /d, we put into a separate list the Latin1 things that
14636 * match only when the target string is utf8 */
14637 SV* nonascii_but_latin1_properties = NULL;
14638 _invlist_intersection(posixes, PL_UpperLatin1,
14639 &nonascii_but_latin1_properties);
14640 _invlist_subtract(posixes, nonascii_but_latin1_properties,
14643 _invlist_union(cp_list, posixes, &cp_list);
14644 SvREFCNT_dec_NN(posixes);
14650 if (depends_list) {
14651 _invlist_union(depends_list, nonascii_but_latin1_properties,
14653 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14656 depends_list = nonascii_but_latin1_properties;
14661 /* And combine the result (if any) with any inversion list from properties.
14662 * The lists are kept separate up to now so that we can distinguish the two
14663 * in regards to matching above-Unicode. A run-time warning is generated
14664 * if a Unicode property is matched against a non-Unicode code point. But,
14665 * we allow user-defined properties to match anything, without any warning,
14666 * and we also suppress the warning if there is a portion of the character
14667 * class that isn't a Unicode property, and which matches above Unicode, \W
14668 * or [\x{110000}] for example.
14669 * (Note that in this case, unlike the Posix one above, there is no
14670 * <depends_list>, because having a Unicode property forces Unicode
14675 /* If it matters to the final outcome, see if a non-property
14676 * component of the class matches above Unicode. If so, the
14677 * warning gets suppressed. This is true even if just a single
14678 * such code point is specified, as though not strictly correct if
14679 * another such code point is matched against, the fact that they
14680 * are using above-Unicode code points indicates they should know
14681 * the issues involved */
14683 warn_super = ! (invert
14684 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14687 _invlist_union(properties, cp_list, &cp_list);
14688 SvREFCNT_dec_NN(properties);
14691 cp_list = properties;
14695 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14699 /* Here, we have calculated what code points should be in the character
14702 * Now we can see about various optimizations. Fold calculation (which we
14703 * did above) needs to take place before inversion. Otherwise /[^k]/i
14704 * would invert to include K, which under /i would match k, which it
14705 * shouldn't. Therefore we can't invert folded locale now, as it won't be
14706 * folded until runtime */
14708 /* If we didn't do folding, it's because some information isn't available
14709 * until runtime; set the run-time fold flag for these. (We don't have to
14710 * worry about properties folding, as that is taken care of by the swash
14711 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
14712 * locales, or the class matches at least one 0-255 range code point */
14714 if (only_utf8_locale_list) {
14715 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14717 else if (cp_list) { /* Look to see if there a 0-255 code point is in
14720 invlist_iterinit(cp_list);
14721 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14722 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14724 invlist_iterfinish(cp_list);
14728 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14729 * at compile time. Besides not inverting folded locale now, we can't
14730 * invert if there are things such as \w, which aren't known until runtime
14734 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14736 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14738 _invlist_invert(cp_list);
14740 /* Any swash can't be used as-is, because we've inverted things */
14742 SvREFCNT_dec_NN(swash);
14746 /* Clear the invert flag since have just done it here */
14751 *ret_invlist = cp_list;
14752 SvREFCNT_dec(swash);
14754 /* Discard the generated node */
14756 RExC_size = orig_size;
14759 RExC_emit = orig_emit;
14764 /* Some character classes are equivalent to other nodes. Such nodes take
14765 * up less room and generally fewer operations to execute than ANYOF nodes.
14766 * Above, we checked for and optimized into some such equivalents for
14767 * certain common classes that are easy to test. Getting to this point in
14768 * the code means that the class didn't get optimized there. Since this
14769 * code is only executed in Pass 2, it is too late to save space--it has
14770 * been allocated in Pass 1, and currently isn't given back. But turning
14771 * things into an EXACTish node can allow the optimizer to join it to any
14772 * adjacent such nodes. And if the class is equivalent to things like /./,
14773 * expensive run-time swashes can be avoided. Now that we have more
14774 * complete information, we can find things necessarily missed by the
14775 * earlier code. I (khw) am not sure how much to look for here. It would
14776 * be easy, but perhaps too slow, to check any candidates against all the
14777 * node types they could possibly match using _invlistEQ(). */
14782 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14783 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14785 /* We don't optimize if we are supposed to make sure all non-Unicode
14786 * code points raise a warning, as only ANYOF nodes have this check.
14788 && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14791 U8 op = END; /* The optimzation node-type */
14792 const char * cur_parse= RExC_parse;
14794 invlist_iterinit(cp_list);
14795 if (! invlist_iternext(cp_list, &start, &end)) {
14797 /* Here, the list is empty. This happens, for example, when a
14798 * Unicode property is the only thing in the character class, and
14799 * it doesn't match anything. (perluniprops.pod notes such
14802 *flagp |= HASWIDTH|SIMPLE;
14804 else if (start == end) { /* The range is a single code point */
14805 if (! invlist_iternext(cp_list, &start, &end)
14807 /* Don't do this optimization if it would require changing
14808 * the pattern to UTF-8 */
14809 && (start < 256 || UTF))
14811 /* Here, the list contains a single code point. Can optimize
14812 * into an EXACTish node */
14821 /* A locale node under folding with one code point can be
14822 * an EXACTFL, as its fold won't be calculated until
14828 /* Here, we are generally folding, but there is only one
14829 * code point to match. If we have to, we use an EXACT
14830 * node, but it would be better for joining with adjacent
14831 * nodes in the optimization pass if we used the same
14832 * EXACTFish node that any such are likely to be. We can
14833 * do this iff the code point doesn't participate in any
14834 * folds. For example, an EXACTF of a colon is the same as
14835 * an EXACT one, since nothing folds to or from a colon. */
14837 if (IS_IN_SOME_FOLD_L1(value)) {
14842 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14847 /* If we haven't found the node type, above, it means we
14848 * can use the prevailing one */
14850 op = compute_EXACTish(pRExC_state);
14855 else if (start == 0) {
14856 if (end == UV_MAX) {
14858 *flagp |= HASWIDTH|SIMPLE;
14861 else if (end == '\n' - 1
14862 && invlist_iternext(cp_list, &start, &end)
14863 && start == '\n' + 1 && end == UV_MAX)
14866 *flagp |= HASWIDTH|SIMPLE;
14870 invlist_iterfinish(cp_list);
14873 RExC_parse = (char *)orig_parse;
14874 RExC_emit = (regnode *)orig_emit;
14876 ret = reg_node(pRExC_state, op);
14878 RExC_parse = (char *)cur_parse;
14880 if (PL_regkind[op] == EXACT) {
14881 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14882 TRUE /* downgradable to EXACT */
14886 SvREFCNT_dec_NN(cp_list);
14891 /* Here, <cp_list> contains all the code points we can determine at
14892 * compile time that match under all conditions. Go through it, and
14893 * for things that belong in the bitmap, put them there, and delete from
14894 * <cp_list>. While we are at it, see if everything above 255 is in the
14895 * list, and if so, set a flag to speed up execution */
14897 populate_ANYOF_from_invlist(ret, &cp_list);
14900 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14903 /* Here, the bitmap has been populated with all the Latin1 code points that
14904 * always match. Can now add to the overall list those that match only
14905 * when the target string is UTF-8 (<depends_list>). */
14906 if (depends_list) {
14908 _invlist_union(cp_list, depends_list, &cp_list);
14909 SvREFCNT_dec_NN(depends_list);
14912 cp_list = depends_list;
14914 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14917 /* If there is a swash and more than one element, we can't use the swash in
14918 * the optimization below. */
14919 if (swash && element_count > 1) {
14920 SvREFCNT_dec_NN(swash);
14924 set_ANYOF_arg(pRExC_state, ret, cp_list,
14925 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14927 only_utf8_locale_list,
14928 swash, has_user_defined_property);
14930 *flagp |= HASWIDTH|SIMPLE;
14932 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14933 RExC_contains_locale = 1;
14939 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14942 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14943 regnode* const node,
14945 SV* const runtime_defns,
14946 SV* const only_utf8_locale_list,
14948 const bool has_user_defined_property)
14950 /* Sets the arg field of an ANYOF-type node 'node', using information about
14951 * the node passed-in. If there is nothing outside the node's bitmap, the
14952 * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to
14953 * the count returned by add_data(), having allocated and stored an array,
14954 * av, that that count references, as follows:
14955 * av[0] stores the character class description in its textual form.
14956 * This is used later (regexec.c:Perl_regclass_swash()) to
14957 * initialize the appropriate swash, and is also useful for dumping
14958 * the regnode. This is set to &PL_sv_undef if the textual
14959 * description is not needed at run-time (as happens if the other
14960 * elements completely define the class)
14961 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14962 * computed from av[0]. But if no further computation need be done,
14963 * the swash is stored here now (and av[0] is &PL_sv_undef).
14964 * av[2] stores the inversion list of code points that match only if the
14965 * current locale is UTF-8
14966 * av[3] stores the cp_list inversion list for use in addition or instead
14967 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14968 * (Otherwise everything needed is already in av[0] and av[1])
14969 * av[4] is set if any component of the class is from a user-defined
14970 * property; used only if av[3] exists */
14974 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14976 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14977 assert(! (ANYOF_FLAGS(node)
14978 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14979 ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14982 AV * const av = newAV();
14985 assert(ANYOF_FLAGS(node)
14986 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14988 av_store(av, 0, (runtime_defns)
14989 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
14992 av_store(av, 1, swash);
14993 SvREFCNT_dec_NN(cp_list);
14996 av_store(av, 1, &PL_sv_undef);
14998 av_store(av, 3, cp_list);
14999 av_store(av, 4, newSVuv(has_user_defined_property));
15003 if (only_utf8_locale_list) {
15004 av_store(av, 2, only_utf8_locale_list);
15007 av_store(av, 2, &PL_sv_undef);
15010 rv = newRV_noinc(MUTABLE_SV(av));
15011 n = add_data(pRExC_state, STR_WITH_LEN("s"));
15012 RExC_rxi->data->data[n] = (void*)rv;
15018 /* reg_skipcomment()
15020 Absorbs an /x style # comment from the input stream,
15021 returning a pointer to the first character beyond the comment, or if the
15022 comment terminates the pattern without anything following it, this returns
15023 one past the final character of the pattern (in other words, RExC_end) and
15024 sets the REG_RUN_ON_COMMENT_SEEN flag.
15026 Note it's the callers responsibility to ensure that we are
15027 actually in /x mode
15031 PERL_STATIC_INLINE char*
15032 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15034 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15038 while (p < RExC_end) {
15039 if (*(++p) == '\n') {
15044 /* we ran off the end of the pattern without ending the comment, so we have
15045 * to add an \n when wrapping */
15046 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15052 Advances the parse position, and optionally absorbs
15053 "whitespace" from the inputstream.
15055 Without /x "whitespace" means (?#...) style comments only,
15056 with /x this means (?#...) and # comments and whitespace proper.
15058 Returns the RExC_parse point from BEFORE the scan occurs.
15060 This is the /x friendly way of saying RExC_parse++.
15064 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15066 char* const retval = RExC_parse++;
15068 PERL_ARGS_ASSERT_NEXTCHAR;
15071 if (RExC_end - RExC_parse >= 3
15072 && *RExC_parse == '('
15073 && RExC_parse[1] == '?'
15074 && RExC_parse[2] == '#')
15076 while (*RExC_parse != ')') {
15077 if (RExC_parse == RExC_end)
15078 FAIL("Sequence (?#... not terminated");
15084 if (RExC_flags & RXf_PMf_EXTENDED) {
15085 char * p = regpatws(pRExC_state, RExC_parse,
15086 TRUE); /* means recognize comments */
15087 if (p != RExC_parse) {
15097 - reg_node - emit a node
15099 STATIC regnode * /* Location. */
15100 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15103 regnode * const ret = RExC_emit;
15104 GET_RE_DEBUG_FLAGS_DECL;
15106 PERL_ARGS_ASSERT_REG_NODE;
15109 SIZE_ALIGN(RExC_size);
15113 if (RExC_emit >= RExC_emit_bound)
15114 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15115 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15117 NODE_ALIGN_FILL(ret);
15119 FILL_ADVANCE_NODE(ptr, op);
15120 #ifdef RE_TRACK_PATTERN_OFFSETS
15121 if (RExC_offsets) { /* MJD */
15123 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15124 "reg_node", __LINE__,
15126 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15127 ? "Overwriting end of array!\n" : "OK",
15128 (UV)(RExC_emit - RExC_emit_start),
15129 (UV)(RExC_parse - RExC_start),
15130 (UV)RExC_offsets[0]));
15131 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15139 - reganode - emit a node with an argument
15141 STATIC regnode * /* Location. */
15142 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15145 regnode * const ret = RExC_emit;
15146 GET_RE_DEBUG_FLAGS_DECL;
15148 PERL_ARGS_ASSERT_REGANODE;
15151 SIZE_ALIGN(RExC_size);
15156 assert(2==regarglen[op]+1);
15158 Anything larger than this has to allocate the extra amount.
15159 If we changed this to be:
15161 RExC_size += (1 + regarglen[op]);
15163 then it wouldn't matter. Its not clear what side effect
15164 might come from that so its not done so far.
15169 if (RExC_emit >= RExC_emit_bound)
15170 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15171 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15173 NODE_ALIGN_FILL(ret);
15175 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15176 #ifdef RE_TRACK_PATTERN_OFFSETS
15177 if (RExC_offsets) { /* MJD */
15179 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15183 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15184 "Overwriting end of array!\n" : "OK",
15185 (UV)(RExC_emit - RExC_emit_start),
15186 (UV)(RExC_parse - RExC_start),
15187 (UV)RExC_offsets[0]));
15188 Set_Cur_Node_Offset;
15196 - reguni - emit (if appropriate) a Unicode character
15198 PERL_STATIC_INLINE STRLEN
15199 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15201 PERL_ARGS_ASSERT_REGUNI;
15203 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15207 - reginsert - insert an operator in front of already-emitted operand
15209 * Means relocating the operand.
15212 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15217 const int offset = regarglen[(U8)op];
15218 const int size = NODE_STEP_REGNODE + offset;
15219 GET_RE_DEBUG_FLAGS_DECL;
15221 PERL_ARGS_ASSERT_REGINSERT;
15222 PERL_UNUSED_CONTEXT;
15223 PERL_UNUSED_ARG(depth);
15224 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15225 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15234 if (RExC_open_parens) {
15236 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15237 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15238 if ( RExC_open_parens[paren] >= opnd ) {
15239 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15240 RExC_open_parens[paren] += size;
15242 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15244 if ( RExC_close_parens[paren] >= opnd ) {
15245 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15246 RExC_close_parens[paren] += size;
15248 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15253 while (src > opnd) {
15254 StructCopy(--src, --dst, regnode);
15255 #ifdef RE_TRACK_PATTERN_OFFSETS
15256 if (RExC_offsets) { /* MJD 20010112 */
15258 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15262 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15263 ? "Overwriting end of array!\n" : "OK",
15264 (UV)(src - RExC_emit_start),
15265 (UV)(dst - RExC_emit_start),
15266 (UV)RExC_offsets[0]));
15267 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15268 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15274 place = opnd; /* Op node, where operand used to be. */
15275 #ifdef RE_TRACK_PATTERN_OFFSETS
15276 if (RExC_offsets) { /* MJD */
15278 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15282 (UV)(place - RExC_emit_start) > RExC_offsets[0]
15283 ? "Overwriting end of array!\n" : "OK",
15284 (UV)(place - RExC_emit_start),
15285 (UV)(RExC_parse - RExC_start),
15286 (UV)RExC_offsets[0]));
15287 Set_Node_Offset(place, RExC_parse);
15288 Set_Node_Length(place, 1);
15291 src = NEXTOPER(place);
15292 FILL_ADVANCE_NODE(place, op);
15293 Zero(src, offset, regnode);
15297 - regtail - set the next-pointer at the end of a node chain of p to val.
15298 - SEE ALSO: regtail_study
15300 /* TODO: All three parms should be const */
15302 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15303 const regnode *val,U32 depth)
15306 GET_RE_DEBUG_FLAGS_DECL;
15308 PERL_ARGS_ASSERT_REGTAIL;
15310 PERL_UNUSED_ARG(depth);
15316 /* Find last node. */
15319 regnode * const temp = regnext(scan);
15321 SV * const mysv=sv_newmortal();
15322 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15323 regprop(RExC_rx, mysv, scan, NULL);
15324 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15325 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15326 (temp == NULL ? "->" : ""),
15327 (temp == NULL ? PL_reg_name[OP(val)] : "")
15335 if (reg_off_by_arg[OP(scan)]) {
15336 ARG_SET(scan, val - scan);
15339 NEXT_OFF(scan) = val - scan;
15345 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15346 - Look for optimizable sequences at the same time.
15347 - currently only looks for EXACT chains.
15349 This is experimental code. The idea is to use this routine to perform
15350 in place optimizations on branches and groups as they are constructed,
15351 with the long term intention of removing optimization from study_chunk so
15352 that it is purely analytical.
15354 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15355 to control which is which.
15358 /* TODO: All four parms should be const */
15361 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15362 const regnode *val,U32 depth)
15366 #ifdef EXPERIMENTAL_INPLACESCAN
15369 GET_RE_DEBUG_FLAGS_DECL;
15371 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15377 /* Find last node. */
15381 regnode * const temp = regnext(scan);
15382 #ifdef EXPERIMENTAL_INPLACESCAN
15383 if (PL_regkind[OP(scan)] == EXACT) {
15384 bool unfolded_multi_char; /* Unexamined in this routine */
15385 if (join_exact(pRExC_state, scan, &min,
15386 &unfolded_multi_char, 1, val, depth+1))
15391 switch (OP(scan)) {
15394 case EXACTFA_NO_TRIE:
15399 if( exact == PSEUDO )
15401 else if ( exact != OP(scan) )
15410 SV * const mysv=sv_newmortal();
15411 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15412 regprop(RExC_rx, mysv, scan, NULL);
15413 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15414 SvPV_nolen_const(mysv),
15415 REG_NODE_NUM(scan),
15416 PL_reg_name[exact]);
15423 SV * const mysv_val=sv_newmortal();
15424 DEBUG_PARSE_MSG("");
15425 regprop(RExC_rx, mysv_val, val, NULL);
15426 PerlIO_printf(Perl_debug_log,
15427 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15428 SvPV_nolen_const(mysv_val),
15429 (IV)REG_NODE_NUM(val),
15433 if (reg_off_by_arg[OP(scan)]) {
15434 ARG_SET(scan, val - scan);
15437 NEXT_OFF(scan) = val - scan;
15445 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15450 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15455 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15457 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15458 if (flags & (1<<bit)) {
15459 if (!set++ && lead)
15460 PerlIO_printf(Perl_debug_log, "%s",lead);
15461 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15466 PerlIO_printf(Perl_debug_log, "\n");
15468 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15473 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15479 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15481 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15482 if (flags & (1<<bit)) {
15483 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15486 if (!set++ && lead)
15487 PerlIO_printf(Perl_debug_log, "%s",lead);
15488 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15491 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15492 if (!set++ && lead) {
15493 PerlIO_printf(Perl_debug_log, "%s",lead);
15496 case REGEX_UNICODE_CHARSET:
15497 PerlIO_printf(Perl_debug_log, "UNICODE");
15499 case REGEX_LOCALE_CHARSET:
15500 PerlIO_printf(Perl_debug_log, "LOCALE");
15502 case REGEX_ASCII_RESTRICTED_CHARSET:
15503 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15505 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15506 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15509 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15515 PerlIO_printf(Perl_debug_log, "\n");
15517 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15523 Perl_regdump(pTHX_ const regexp *r)
15526 SV * const sv = sv_newmortal();
15527 SV *dsv= sv_newmortal();
15528 RXi_GET_DECL(r,ri);
15529 GET_RE_DEBUG_FLAGS_DECL;
15531 PERL_ARGS_ASSERT_REGDUMP;
15533 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15535 /* Header fields of interest. */
15536 if (r->anchored_substr) {
15537 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15538 RE_SV_DUMPLEN(r->anchored_substr), 30);
15539 PerlIO_printf(Perl_debug_log,
15540 "anchored %s%s at %"IVdf" ",
15541 s, RE_SV_TAIL(r->anchored_substr),
15542 (IV)r->anchored_offset);
15543 } else if (r->anchored_utf8) {
15544 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15545 RE_SV_DUMPLEN(r->anchored_utf8), 30);
15546 PerlIO_printf(Perl_debug_log,
15547 "anchored utf8 %s%s at %"IVdf" ",
15548 s, RE_SV_TAIL(r->anchored_utf8),
15549 (IV)r->anchored_offset);
15551 if (r->float_substr) {
15552 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15553 RE_SV_DUMPLEN(r->float_substr), 30);
15554 PerlIO_printf(Perl_debug_log,
15555 "floating %s%s at %"IVdf"..%"UVuf" ",
15556 s, RE_SV_TAIL(r->float_substr),
15557 (IV)r->float_min_offset, (UV)r->float_max_offset);
15558 } else if (r->float_utf8) {
15559 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15560 RE_SV_DUMPLEN(r->float_utf8), 30);
15561 PerlIO_printf(Perl_debug_log,
15562 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15563 s, RE_SV_TAIL(r->float_utf8),
15564 (IV)r->float_min_offset, (UV)r->float_max_offset);
15566 if (r->check_substr || r->check_utf8)
15567 PerlIO_printf(Perl_debug_log,
15569 (r->check_substr == r->float_substr
15570 && r->check_utf8 == r->float_utf8
15571 ? "(checking floating" : "(checking anchored"));
15572 if (r->intflags & PREGf_NOSCAN)
15573 PerlIO_printf(Perl_debug_log, " noscan");
15574 if (r->extflags & RXf_CHECK_ALL)
15575 PerlIO_printf(Perl_debug_log, " isall");
15576 if (r->check_substr || r->check_utf8)
15577 PerlIO_printf(Perl_debug_log, ") ");
15579 if (ri->regstclass) {
15580 regprop(r, sv, ri->regstclass, NULL);
15581 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15583 if (r->intflags & PREGf_ANCH) {
15584 PerlIO_printf(Perl_debug_log, "anchored");
15585 if (r->intflags & PREGf_ANCH_BOL)
15586 PerlIO_printf(Perl_debug_log, "(BOL)");
15587 if (r->intflags & PREGf_ANCH_MBOL)
15588 PerlIO_printf(Perl_debug_log, "(MBOL)");
15589 if (r->intflags & PREGf_ANCH_SBOL)
15590 PerlIO_printf(Perl_debug_log, "(SBOL)");
15591 if (r->intflags & PREGf_ANCH_GPOS)
15592 PerlIO_printf(Perl_debug_log, "(GPOS)");
15593 PerlIO_putc(Perl_debug_log, ' ');
15595 if (r->intflags & PREGf_GPOS_SEEN)
15596 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15597 if (r->intflags & PREGf_SKIP)
15598 PerlIO_printf(Perl_debug_log, "plus ");
15599 if (r->intflags & PREGf_IMPLICIT)
15600 PerlIO_printf(Perl_debug_log, "implicit ");
15601 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15602 if (r->extflags & RXf_EVAL_SEEN)
15603 PerlIO_printf(Perl_debug_log, "with eval ");
15604 PerlIO_printf(Perl_debug_log, "\n");
15606 regdump_extflags("r->extflags: ",r->extflags);
15607 regdump_intflags("r->intflags: ",r->intflags);
15610 PERL_ARGS_ASSERT_REGDUMP;
15611 PERL_UNUSED_CONTEXT;
15612 PERL_UNUSED_ARG(r);
15613 #endif /* DEBUGGING */
15617 - regprop - printable representation of opcode, with run time support
15621 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15626 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15627 static const char * const anyofs[] = {
15628 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15629 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
15630 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
15631 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
15632 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
15633 || _CC_VERTSPACE != 16
15634 #error Need to adjust order of anyofs[]
15671 RXi_GET_DECL(prog,progi);
15672 GET_RE_DEBUG_FLAGS_DECL;
15674 PERL_ARGS_ASSERT_REGPROP;
15678 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
15679 /* It would be nice to FAIL() here, but this may be called from
15680 regexec.c, and it would be hard to supply pRExC_state. */
15681 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15682 (int)OP(o), (int)REGNODE_MAX);
15683 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15685 k = PL_regkind[OP(o)];
15688 sv_catpvs(sv, " ");
15689 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15690 * is a crude hack but it may be the best for now since
15691 * we have no flag "this EXACTish node was UTF-8"
15693 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15694 PERL_PV_ESCAPE_UNI_DETECT |
15695 PERL_PV_ESCAPE_NONASCII |
15696 PERL_PV_PRETTY_ELLIPSES |
15697 PERL_PV_PRETTY_LTGT |
15698 PERL_PV_PRETTY_NOCLEAR
15700 } else if (k == TRIE) {
15701 /* print the details of the trie in dumpuntil instead, as
15702 * progi->data isn't available here */
15703 const char op = OP(o);
15704 const U32 n = ARG(o);
15705 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15706 (reg_ac_data *)progi->data->data[n] :
15708 const reg_trie_data * const trie
15709 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15711 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15712 DEBUG_TRIE_COMPILE_r(
15713 Perl_sv_catpvf(aTHX_ sv,
15714 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15715 (UV)trie->startstate,
15716 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15717 (UV)trie->wordcount,
15720 (UV)TRIE_CHARCOUNT(trie),
15721 (UV)trie->uniquecharcount
15724 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15725 sv_catpvs(sv, "[");
15726 (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15728 : TRIE_BITMAP(trie));
15729 sv_catpvs(sv, "]");
15732 } else if (k == CURLY) {
15733 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15734 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15735 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15737 else if (k == WHILEM && o->flags) /* Ordinal/of */
15738 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15739 else if (k == REF || k == OPEN || k == CLOSE
15740 || k == GROUPP || OP(o)==ACCEPT)
15742 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15743 if ( RXp_PAREN_NAMES(prog) ) {
15744 if ( k != REF || (OP(o) < NREF)) {
15745 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15746 SV **name= av_fetch(list, ARG(o), 0 );
15748 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15751 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15752 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15753 I32 *nums=(I32*)SvPVX(sv_dat);
15754 SV **name= av_fetch(list, nums[0], 0 );
15757 for ( n=0; n<SvIVX(sv_dat); n++ ) {
15758 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15759 (n ? "," : ""), (IV)nums[n]);
15761 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15765 if ( k == REF && reginfo) {
15766 U32 n = ARG(o); /* which paren pair */
15767 I32 ln = prog->offs[n].start;
15768 if (prog->lastparen < n || ln == -1)
15769 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15770 else if (ln == prog->offs[n].end)
15771 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15773 const char *s = reginfo->strbeg + ln;
15774 Perl_sv_catpvf(aTHX_ sv, ": ");
15775 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15776 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15779 } else if (k == GOSUB)
15780 /* Paren and offset */
15781 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15782 else if (k == VERB) {
15784 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15785 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15786 } else if (k == LOGICAL)
15787 /* 2: embedded, otherwise 1 */
15788 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15789 else if (k == ANYOF) {
15790 const U8 flags = ANYOF_FLAGS(o);
15794 if (flags & ANYOF_LOCALE_FLAGS)
15795 sv_catpvs(sv, "{loc}");
15796 if (flags & ANYOF_LOC_FOLD)
15797 sv_catpvs(sv, "{i}");
15798 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15799 if (flags & ANYOF_INVERT)
15800 sv_catpvs(sv, "^");
15802 /* output what the standard cp 0-255 bitmap matches */
15803 do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15805 /* output any special charclass tests (used entirely under use
15807 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15809 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15810 if (ANYOF_POSIXL_TEST(o,i)) {
15811 sv_catpv(sv, anyofs[i]);
15817 if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15819 |ANYOF_NONBITMAP_NON_UTF8
15823 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15824 if (flags & ANYOF_INVERT)
15825 /*make sure the invert info is in each */
15826 sv_catpvs(sv, "^");
15829 if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15830 sv_catpvs(sv, "{non-utf8-latin1-all}");
15833 /* output information about the unicode matching */
15834 if (flags & ANYOF_ABOVE_LATIN1_ALL)
15835 sv_catpvs(sv, "{unicode_all}");
15836 else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15837 SV *lv; /* Set if there is something outside the bit map. */
15838 bool byte_output = FALSE; /* If something in the bitmap has
15840 SV *only_utf8_locale;
15842 /* Get the stuff that wasn't in the bitmap */
15843 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15844 &lv, &only_utf8_locale);
15845 if (lv && lv != &PL_sv_undef) {
15846 char *s = savesvpv(lv);
15847 char * const origs = s;
15849 while (*s && *s != '\n')
15853 const char * const t = ++s;
15855 if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15856 sv_catpvs(sv, "{outside bitmap}");
15859 sv_catpvs(sv, "{utf8}");
15863 sv_catpvs(sv, " ");
15869 /* Truncate very long output */
15870 if (s - origs > 256) {
15871 Perl_sv_catpvf(aTHX_ sv,
15873 (int) (s - origs - 1),
15879 else if (*s == '\t') {
15893 SvREFCNT_dec_NN(lv);
15896 if ((flags & ANYOF_LOC_FOLD)
15897 && only_utf8_locale
15898 && only_utf8_locale != &PL_sv_undef)
15901 int max_entries = 256;
15903 sv_catpvs(sv, "{utf8 locale}");
15904 invlist_iterinit(only_utf8_locale);
15905 while (invlist_iternext(only_utf8_locale,
15907 put_range(sv, start, end);
15909 if (max_entries < 0) {
15910 sv_catpvs(sv, "...");
15914 invlist_iterfinish(only_utf8_locale);
15919 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15921 else if (k == POSIXD || k == NPOSIXD) {
15922 U8 index = FLAGS(o) * 2;
15923 if (index < C_ARRAY_LENGTH(anyofs)) {
15924 if (*anyofs[index] != '[') {
15927 sv_catpv(sv, anyofs[index]);
15928 if (*anyofs[index] != '[') {
15933 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15936 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15937 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15939 PERL_UNUSED_CONTEXT;
15940 PERL_UNUSED_ARG(sv);
15941 PERL_UNUSED_ARG(o);
15942 PERL_UNUSED_ARG(prog);
15943 PERL_UNUSED_ARG(reginfo);
15944 #endif /* DEBUGGING */
15950 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15951 { /* Assume that RE_INTUIT is set */
15952 struct regexp *const prog = ReANY(r);
15953 GET_RE_DEBUG_FLAGS_DECL;
15955 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15956 PERL_UNUSED_CONTEXT;
15960 const char * const s = SvPV_nolen_const(prog->check_substr
15961 ? prog->check_substr : prog->check_utf8);
15963 if (!PL_colorset) reginitcolors();
15964 PerlIO_printf(Perl_debug_log,
15965 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15967 prog->check_substr ? "" : "utf8 ",
15968 PL_colors[5],PL_colors[0],
15971 (strlen(s) > 60 ? "..." : ""));
15974 return prog->check_substr ? prog->check_substr : prog->check_utf8;
15980 handles refcounting and freeing the perl core regexp structure. When
15981 it is necessary to actually free the structure the first thing it
15982 does is call the 'free' method of the regexp_engine associated to
15983 the regexp, allowing the handling of the void *pprivate; member
15984 first. (This routine is not overridable by extensions, which is why
15985 the extensions free is called first.)
15987 See regdupe and regdupe_internal if you change anything here.
15989 #ifndef PERL_IN_XSUB_RE
15991 Perl_pregfree(pTHX_ REGEXP *r)
15997 Perl_pregfree2(pTHX_ REGEXP *rx)
15999 struct regexp *const r = ReANY(rx);
16000 GET_RE_DEBUG_FLAGS_DECL;
16002 PERL_ARGS_ASSERT_PREGFREE2;
16004 if (r->mother_re) {
16005 ReREFCNT_dec(r->mother_re);
16007 CALLREGFREE_PVT(rx); /* free the private data */
16008 SvREFCNT_dec(RXp_PAREN_NAMES(r));
16009 Safefree(r->xpv_len_u.xpvlenu_pv);
16012 SvREFCNT_dec(r->anchored_substr);
16013 SvREFCNT_dec(r->anchored_utf8);
16014 SvREFCNT_dec(r->float_substr);
16015 SvREFCNT_dec(r->float_utf8);
16016 Safefree(r->substrs);
16018 RX_MATCH_COPY_FREE(rx);
16019 #ifdef PERL_ANY_COW
16020 SvREFCNT_dec(r->saved_copy);
16023 SvREFCNT_dec(r->qr_anoncv);
16024 rx->sv_u.svu_rx = 0;
16029 This is a hacky workaround to the structural issue of match results
16030 being stored in the regexp structure which is in turn stored in
16031 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16032 could be PL_curpm in multiple contexts, and could require multiple
16033 result sets being associated with the pattern simultaneously, such
16034 as when doing a recursive match with (??{$qr})
16036 The solution is to make a lightweight copy of the regexp structure
16037 when a qr// is returned from the code executed by (??{$qr}) this
16038 lightweight copy doesn't actually own any of its data except for
16039 the starp/end and the actual regexp structure itself.
16045 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16047 struct regexp *ret;
16048 struct regexp *const r = ReANY(rx);
16049 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16051 PERL_ARGS_ASSERT_REG_TEMP_COPY;
16054 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16056 SvOK_off((SV *)ret_x);
16058 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16059 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
16060 made both spots point to the same regexp body.) */
16061 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16062 assert(!SvPVX(ret_x));
16063 ret_x->sv_u.svu_rx = temp->sv_any;
16064 temp->sv_any = NULL;
16065 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16066 SvREFCNT_dec_NN(temp);
16067 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16068 ing below will not set it. */
16069 SvCUR_set(ret_x, SvCUR(rx));
16072 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16073 sv_force_normal(sv) is called. */
16075 ret = ReANY(ret_x);
16077 SvFLAGS(ret_x) |= SvUTF8(rx);
16078 /* We share the same string buffer as the original regexp, on which we
16079 hold a reference count, incremented when mother_re is set below.
16080 The string pointer is copied here, being part of the regexp struct.
16082 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16083 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16085 const I32 npar = r->nparens+1;
16086 Newx(ret->offs, npar, regexp_paren_pair);
16087 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16090 Newx(ret->substrs, 1, struct reg_substr_data);
16091 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16093 SvREFCNT_inc_void(ret->anchored_substr);
16094 SvREFCNT_inc_void(ret->anchored_utf8);
16095 SvREFCNT_inc_void(ret->float_substr);
16096 SvREFCNT_inc_void(ret->float_utf8);
16098 /* check_substr and check_utf8, if non-NULL, point to either their
16099 anchored or float namesakes, and don't hold a second reference. */
16101 RX_MATCH_COPIED_off(ret_x);
16102 #ifdef PERL_ANY_COW
16103 ret->saved_copy = NULL;
16105 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16106 SvREFCNT_inc_void(ret->qr_anoncv);
16112 /* regfree_internal()
16114 Free the private data in a regexp. This is overloadable by
16115 extensions. Perl takes care of the regexp structure in pregfree(),
16116 this covers the *pprivate pointer which technically perl doesn't
16117 know about, however of course we have to handle the
16118 regexp_internal structure when no extension is in use.
16120 Note this is called before freeing anything in the regexp
16125 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16127 struct regexp *const r = ReANY(rx);
16128 RXi_GET_DECL(r,ri);
16129 GET_RE_DEBUG_FLAGS_DECL;
16131 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16137 SV *dsv= sv_newmortal();
16138 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16139 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16140 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16141 PL_colors[4],PL_colors[5],s);
16144 #ifdef RE_TRACK_PATTERN_OFFSETS
16146 Safefree(ri->u.offsets); /* 20010421 MJD */
16148 if (ri->code_blocks) {
16150 for (n = 0; n < ri->num_code_blocks; n++)
16151 SvREFCNT_dec(ri->code_blocks[n].src_regex);
16152 Safefree(ri->code_blocks);
16156 int n = ri->data->count;
16159 /* If you add a ->what type here, update the comment in regcomp.h */
16160 switch (ri->data->what[n]) {
16166 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16169 Safefree(ri->data->data[n]);
16175 { /* Aho Corasick add-on structure for a trie node.
16176 Used in stclass optimization only */
16178 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16179 #ifdef USE_ITHREADS
16183 refcount = --aho->refcount;
16186 PerlMemShared_free(aho->states);
16187 PerlMemShared_free(aho->fail);
16188 /* do this last!!!! */
16189 PerlMemShared_free(ri->data->data[n]);
16190 /* we should only ever get called once, so
16191 * assert as much, and also guard the free
16192 * which /might/ happen twice. At the least
16193 * it will make code anlyzers happy and it
16194 * doesn't cost much. - Yves */
16195 assert(ri->regstclass);
16196 if (ri->regstclass) {
16197 PerlMemShared_free(ri->regstclass);
16198 ri->regstclass = 0;
16205 /* trie structure. */
16207 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16208 #ifdef USE_ITHREADS
16212 refcount = --trie->refcount;
16215 PerlMemShared_free(trie->charmap);
16216 PerlMemShared_free(trie->states);
16217 PerlMemShared_free(trie->trans);
16219 PerlMemShared_free(trie->bitmap);
16221 PerlMemShared_free(trie->jump);
16222 PerlMemShared_free(trie->wordinfo);
16223 /* do this last!!!! */
16224 PerlMemShared_free(ri->data->data[n]);
16229 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16230 ri->data->what[n]);
16233 Safefree(ri->data->what);
16234 Safefree(ri->data);
16240 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16241 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16242 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16245 re_dup - duplicate a regexp.
16247 This routine is expected to clone a given regexp structure. It is only
16248 compiled under USE_ITHREADS.
16250 After all of the core data stored in struct regexp is duplicated
16251 the regexp_engine.dupe method is used to copy any private data
16252 stored in the *pprivate pointer. This allows extensions to handle
16253 any duplication it needs to do.
16255 See pregfree() and regfree_internal() if you change anything here.
16257 #if defined(USE_ITHREADS)
16258 #ifndef PERL_IN_XSUB_RE
16260 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16264 const struct regexp *r = ReANY(sstr);
16265 struct regexp *ret = ReANY(dstr);
16267 PERL_ARGS_ASSERT_RE_DUP_GUTS;
16269 npar = r->nparens+1;
16270 Newx(ret->offs, npar, regexp_paren_pair);
16271 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16273 if (ret->substrs) {
16274 /* Do it this way to avoid reading from *r after the StructCopy().
16275 That way, if any of the sv_dup_inc()s dislodge *r from the L1
16276 cache, it doesn't matter. */
16277 const bool anchored = r->check_substr
16278 ? r->check_substr == r->anchored_substr
16279 : r->check_utf8 == r->anchored_utf8;
16280 Newx(ret->substrs, 1, struct reg_substr_data);
16281 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16283 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16284 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16285 ret->float_substr = sv_dup_inc(ret->float_substr, param);
16286 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16288 /* check_substr and check_utf8, if non-NULL, point to either their
16289 anchored or float namesakes, and don't hold a second reference. */
16291 if (ret->check_substr) {
16293 assert(r->check_utf8 == r->anchored_utf8);
16294 ret->check_substr = ret->anchored_substr;
16295 ret->check_utf8 = ret->anchored_utf8;
16297 assert(r->check_substr == r->float_substr);
16298 assert(r->check_utf8 == r->float_utf8);
16299 ret->check_substr = ret->float_substr;
16300 ret->check_utf8 = ret->float_utf8;
16302 } else if (ret->check_utf8) {
16304 ret->check_utf8 = ret->anchored_utf8;
16306 ret->check_utf8 = ret->float_utf8;
16311 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16312 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16315 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16317 if (RX_MATCH_COPIED(dstr))
16318 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
16320 ret->subbeg = NULL;
16321 #ifdef PERL_ANY_COW
16322 ret->saved_copy = NULL;
16325 /* Whether mother_re be set or no, we need to copy the string. We
16326 cannot refrain from copying it when the storage points directly to
16327 our mother regexp, because that's
16328 1: a buffer in a different thread
16329 2: something we no longer hold a reference on
16330 so we need to copy it locally. */
16331 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16332 ret->mother_re = NULL;
16334 #endif /* PERL_IN_XSUB_RE */
16339 This is the internal complement to regdupe() which is used to copy
16340 the structure pointed to by the *pprivate pointer in the regexp.
16341 This is the core version of the extension overridable cloning hook.
16342 The regexp structure being duplicated will be copied by perl prior
16343 to this and will be provided as the regexp *r argument, however
16344 with the /old/ structures pprivate pointer value. Thus this routine
16345 may override any copying normally done by perl.
16347 It returns a pointer to the new regexp_internal structure.
16351 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16354 struct regexp *const r = ReANY(rx);
16355 regexp_internal *reti;
16357 RXi_GET_DECL(r,ri);
16359 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16363 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16364 char, regexp_internal);
16365 Copy(ri->program, reti->program, len+1, regnode);
16367 reti->num_code_blocks = ri->num_code_blocks;
16368 if (ri->code_blocks) {
16370 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16371 struct reg_code_block);
16372 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16373 struct reg_code_block);
16374 for (n = 0; n < ri->num_code_blocks; n++)
16375 reti->code_blocks[n].src_regex = (REGEXP*)
16376 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16379 reti->code_blocks = NULL;
16381 reti->regstclass = NULL;
16384 struct reg_data *d;
16385 const int count = ri->data->count;
16388 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16389 char, struct reg_data);
16390 Newx(d->what, count, U8);
16393 for (i = 0; i < count; i++) {
16394 d->what[i] = ri->data->what[i];
16395 switch (d->what[i]) {
16396 /* see also regcomp.h and regfree_internal() */
16397 case 'a': /* actually an AV, but the dup function is identical. */
16401 case 'u': /* actually an HV, but the dup function is identical. */
16402 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16405 /* This is cheating. */
16406 Newx(d->data[i], 1, regnode_ssc);
16407 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16408 reti->regstclass = (regnode*)d->data[i];
16411 /* Trie stclasses are readonly and can thus be shared
16412 * without duplication. We free the stclass in pregfree
16413 * when the corresponding reg_ac_data struct is freed.
16415 reti->regstclass= ri->regstclass;
16419 ((reg_trie_data*)ri->data->data[i])->refcount++;
16424 d->data[i] = ri->data->data[i];
16427 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16428 ri->data->what[i]);
16437 reti->name_list_idx = ri->name_list_idx;
16439 #ifdef RE_TRACK_PATTERN_OFFSETS
16440 if (ri->u.offsets) {
16441 Newx(reti->u.offsets, 2*len+1, U32);
16442 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16445 SetProgLen(reti,len);
16448 return (void*)reti;
16451 #endif /* USE_ITHREADS */
16453 #ifndef PERL_IN_XSUB_RE
16456 - regnext - dig the "next" pointer out of a node
16459 Perl_regnext(pTHX_ regnode *p)
16466 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
16467 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16468 (int)OP(p), (int)REGNODE_MAX);
16471 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16480 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16483 STRLEN l1 = strlen(pat1);
16484 STRLEN l2 = strlen(pat2);
16487 const char *message;
16489 PERL_ARGS_ASSERT_RE_CROAK2;
16495 Copy(pat1, buf, l1 , char);
16496 Copy(pat2, buf + l1, l2 , char);
16497 buf[l1 + l2] = '\n';
16498 buf[l1 + l2 + 1] = '\0';
16499 va_start(args, pat2);
16500 msv = vmess(buf, &args);
16502 message = SvPV_const(msv,l1);
16505 Copy(message, buf, l1 , char);
16506 /* l1-1 to avoid \n */
16507 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16510 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
16512 #ifndef PERL_IN_XSUB_RE
16514 Perl_save_re_context(pTHX)
16516 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16518 const REGEXP * const rx = PM_GETRE(PL_curpm);
16521 for (i = 1; i <= RX_NPARENS(rx); i++) {
16522 char digits[TYPE_CHARS(long)];
16523 const STRLEN len = my_snprintf(digits, sizeof(digits),
16525 GV *const *const gvp
16526 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16529 GV * const gv = *gvp;
16530 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16542 S_put_byte(pTHX_ SV *sv, int c)
16544 PERL_ARGS_ASSERT_PUT_BYTE;
16548 case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16549 case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16550 case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16551 case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16552 case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16555 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16560 const char string = c;
16561 if (c == '-' || c == ']' || c == '\\' || c == '^')
16562 sv_catpvs(sv, "\\");
16563 sv_catpvn(sv, &string, 1);
16568 S_put_range(pTHX_ SV *sv, UV start, UV end)
16571 /* Appends to 'sv' a displayable version of the range of code points from
16572 * 'start' to 'end'. It assumes that only ASCII printables are displayable
16573 * as-is (though some of these will be escaped by put_byte()). For the
16574 * time being, this subroutine only works for latin1 (< 256) code points */
16576 assert(start <= end);
16578 PERL_ARGS_ASSERT_PUT_RANGE;
16580 while (start <= end) {
16581 if (end - start < 3) { /* Individual chars in short ranges */
16582 for (; start <= end; start++) {
16583 put_byte(sv, start);
16588 /* For small ranges that include printable ASCII characters, it's more
16589 * legible to print those characters rather than hex values. For
16590 * larger ranges that include more than printables, it's probably
16591 * clearer to just give the start and end points of the range in hex,
16592 * and that's all we can do if there aren't any printables within the
16595 * On ASCII platforms the range of printables is contiguous. If the
16596 * entire range is printable, we print each character as such. If the
16597 * range is partially printable and partially not, it's less likely
16598 * that the individual printables are meaningful, especially if all or
16599 * almost all of them are in the range. But we err on the side of the
16600 * individual printables being meaningful by using the hex only if the
16601 * range contains all but 2 of the printables.
16603 * On EBCDIC platforms, the printables are scattered around so that the
16604 * maximum range length containing only them is about 10. Anything
16605 * longer we treat as hex; otherwise we examine the range character by
16606 * character to see */
16608 if (start < 256 && (((end < 255) ? end : 255) - start <= 10))
16610 if ((isPRINT_A(start) && isPRINT_A(end))
16611 || (end >= 0x7F && (isPRINT_A(start) && start > 0x21))
16612 || ((end < 0x7D && isPRINT_A(end)) && start < 0x20))
16615 /* If the range beginning isn't an ASCII printable, we find the
16616 * last such in the range, then split the output, so all the
16617 * non-printables are in one subrange; then process the remaining
16618 * portion as usual. If the entire range isn't printables, we
16619 * don't split, but drop down to print as hex */
16620 if (! isPRINT_A(start)) {
16621 UV temp_end = start + 1;
16622 while (temp_end <= end && ! isPRINT_A(temp_end)) {
16625 if (temp_end <= end) {
16626 put_range(sv, start, temp_end - 1);
16632 /* If the range beginning is a digit, output a subrange of just the
16633 * digits, then process the remaining portion as usual */
16634 if (isDIGIT_A(start)) {
16635 put_byte(sv, start);
16636 sv_catpvs(sv, "-");
16637 while (start <= end && isDIGIT_A(start)) start++;
16638 put_byte(sv, start - 1);
16642 /* Similarly for alphabetics. Because in both ASCII and EBCDIC,
16643 * the code points for upper and lower A-Z and a-z aren't
16644 * intermixed, the resulting subrange will consist solely of either
16645 * upper- or lower- alphabetics */
16646 if (isALPHA_A(start)) {
16647 put_byte(sv, start);
16648 sv_catpvs(sv, "-");
16649 while (start <= end && isALPHA_A(start)) start++;
16650 put_byte(sv, start - 1);
16654 /* We output any remaining printables as individual characters */
16655 if (isPUNCT_A(start) || isSPACE_A(start)) {
16656 while (start <= end && (isPUNCT_A(start) || isSPACE_A(start))) {
16657 put_byte(sv, start);
16664 /* Here is a control or non-ascii. Output the range or subrange as
16666 Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16668 (end < 256) ? end : 255);
16674 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16676 /* Appends to 'sv' a displayable version of the innards of the bracketed
16677 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
16678 * output anything */
16681 bool has_output_anything = FALSE;
16683 PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16685 for (i = 0; i < 256; i++) {
16686 if (BITMAP_TEST((U8 *) bitmap,i)) {
16688 /* The character at index i should be output. Find the next
16689 * character that should NOT be output */
16691 for (j = i + 1; j < 256; j++) {
16692 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16697 /* Everything between them is a single range that should be output
16699 put_range(sv, i, j - 1);
16700 has_output_anything = TRUE;
16705 return has_output_anything;
16708 #define CLEAR_OPTSTART \
16709 if (optstart) STMT_START { \
16710 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
16711 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16715 #define DUMPUNTIL(b,e) \
16717 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16719 STATIC const regnode *
16720 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16721 const regnode *last, const regnode *plast,
16722 SV* sv, I32 indent, U32 depth)
16724 U8 op = PSEUDO; /* Arbitrary non-END op. */
16725 const regnode *next;
16726 const regnode *optstart= NULL;
16728 RXi_GET_DECL(r,ri);
16729 GET_RE_DEBUG_FLAGS_DECL;
16731 PERL_ARGS_ASSERT_DUMPUNTIL;
16733 #ifdef DEBUG_DUMPUNTIL
16734 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16735 last ? last-start : 0,plast ? plast-start : 0);
16738 if (plast && plast < last)
16741 while (PL_regkind[op] != END && (!last || node < last)) {
16743 /* While that wasn't END last time... */
16746 if (op == CLOSE || op == WHILEM)
16748 next = regnext((regnode *)node);
16751 if (OP(node) == OPTIMIZED) {
16752 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16759 regprop(r, sv, node, NULL);
16760 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16761 (int)(2*indent + 1), "", SvPVX_const(sv));
16763 if (OP(node) != OPTIMIZED) {
16764 if (next == NULL) /* Next ptr. */
16765 PerlIO_printf(Perl_debug_log, " (0)");
16766 else if (PL_regkind[(U8)op] == BRANCH
16767 && PL_regkind[OP(next)] != BRANCH )
16768 PerlIO_printf(Perl_debug_log, " (FAIL)");
16770 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16771 (void)PerlIO_putc(Perl_debug_log, '\n');
16775 if (PL_regkind[(U8)op] == BRANCHJ) {
16778 const regnode *nnode = (OP(next) == LONGJMP
16779 ? regnext((regnode *)next)
16781 if (last && nnode > last)
16783 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16786 else if (PL_regkind[(U8)op] == BRANCH) {
16788 DUMPUNTIL(NEXTOPER(node), next);
16790 else if ( PL_regkind[(U8)op] == TRIE ) {
16791 const regnode *this_trie = node;
16792 const char op = OP(node);
16793 const U32 n = ARG(node);
16794 const reg_ac_data * const ac = op>=AHOCORASICK ?
16795 (reg_ac_data *)ri->data->data[n] :
16797 const reg_trie_data * const trie =
16798 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16800 AV *const trie_words
16801 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16803 const regnode *nextbranch= NULL;
16806 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16807 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16809 PerlIO_printf(Perl_debug_log, "%*s%s ",
16810 (int)(2*(indent+3)), "",
16812 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16813 SvCUR(*elem_ptr), 60,
16814 PL_colors[0], PL_colors[1],
16816 ? PERL_PV_ESCAPE_UNI
16818 | PERL_PV_PRETTY_ELLIPSES
16819 | PERL_PV_PRETTY_LTGT
16824 U16 dist= trie->jump[word_idx+1];
16825 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16826 (UV)((dist ? this_trie + dist : next) - start));
16829 nextbranch= this_trie + trie->jump[0];
16830 DUMPUNTIL(this_trie + dist, nextbranch);
16832 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16833 nextbranch= regnext((regnode *)nextbranch);
16835 PerlIO_printf(Perl_debug_log, "\n");
16838 if (last && next > last)
16843 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
16844 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16845 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16847 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16849 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16851 else if ( op == PLUS || op == STAR) {
16852 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16854 else if (PL_regkind[(U8)op] == ANYOF) {
16855 /* arglen 1 + class block */
16856 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16857 ? ANYOF_POSIXL_SKIP
16859 node = NEXTOPER(node);
16861 else if (PL_regkind[(U8)op] == EXACT) {
16862 /* Literal string, where present. */
16863 node += NODE_SZ_STR(node) - 1;
16864 node = NEXTOPER(node);
16867 node = NEXTOPER(node);
16868 node += regarglen[(U8)op];
16870 if (op == CURLYX || op == OPEN)
16874 #ifdef DEBUG_DUMPUNTIL
16875 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16880 #endif /* DEBUGGING */
16884 * c-indentation-style: bsd
16885 * c-basic-offset: 4
16886 * indent-tabs-mode: nil
16889 * ex: set ts=8 sts=4 sw=4 et: