5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define STATIC static
106 struct RExC_state_t {
107 U32 flags; /* RXf_* are we folding, multilining? */
108 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 SSize_t whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit_bound; /* First regnode outside of the
121 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
122 implies compiling, so don't emit */
123 regnode_ssc emit_dummy; /* placeholder for emit to point to;
124 large enough for the largest
125 non-EXACTish node, so can use it as
127 I32 naughty; /* How bad is this pattern? */
128 I32 sawback; /* Did we see \1, ...? */
130 SSize_t size; /* Code size. */
131 I32 npar; /* Capture buffer count, (OPEN) plus
132 one. ("par" 0 is the whole
134 I32 nestroot; /* root parens we are in - used by
138 regnode **open_parens; /* pointers to open parens */
139 regnode **close_parens; /* pointers to close parens */
140 regnode *opend; /* END node in program */
141 I32 utf8; /* whether the pattern is utf8 or not */
142 I32 orig_utf8; /* whether the pattern was originally in utf8 */
143 /* XXX use this for future optimisation of case
144 * where pattern must be upgraded to utf8. */
145 I32 uni_semantics; /* If a d charset modifier should use unicode
146 rules, even if the pattern is not in
148 HV *paren_names; /* Paren names */
150 regnode **recurse; /* Recurse regops */
151 I32 recurse_count; /* Number of recurse regops */
152 U8 *study_chunk_recursed; /* bitmap of which parens we have moved
154 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
158 I32 override_recoding;
159 I32 in_multi_char_class;
160 struct reg_code_block *code_blocks; /* positions of literal (?{})
162 int num_code_blocks; /* size of code_blocks[] */
163 int code_index; /* next code_blocks[] slot */
164 SSize_t maxlen; /* mininum possible number of chars in string to match */
165 #ifdef ADD_TO_REGEXEC
166 char *starttry; /* -Dr: where regtry was called. */
167 #define RExC_starttry (pRExC_state->starttry)
169 SV *runtime_code_qr; /* qr with the runtime code blocks */
171 const char *lastparse;
173 AV *paren_name_list; /* idx -> name */
174 #define RExC_lastparse (pRExC_state->lastparse)
175 #define RExC_lastnum (pRExC_state->lastnum)
176 #define RExC_paren_name_list (pRExC_state->paren_name_list)
180 #define RExC_flags (pRExC_state->flags)
181 #define RExC_pm_flags (pRExC_state->pm_flags)
182 #define RExC_precomp (pRExC_state->precomp)
183 #define RExC_rx_sv (pRExC_state->rx_sv)
184 #define RExC_rx (pRExC_state->rx)
185 #define RExC_rxi (pRExC_state->rxi)
186 #define RExC_start (pRExC_state->start)
187 #define RExC_end (pRExC_state->end)
188 #define RExC_parse (pRExC_state->parse)
189 #define RExC_whilem_seen (pRExC_state->whilem_seen)
190 #ifdef RE_TRACK_PATTERN_OFFSETS
191 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
194 #define RExC_emit (pRExC_state->emit)
195 #define RExC_emit_dummy (pRExC_state->emit_dummy)
196 #define RExC_emit_start (pRExC_state->emit_start)
197 #define RExC_emit_bound (pRExC_state->emit_bound)
198 #define RExC_naughty (pRExC_state->naughty)
199 #define RExC_sawback (pRExC_state->sawback)
200 #define RExC_seen (pRExC_state->seen)
201 #define RExC_size (pRExC_state->size)
202 #define RExC_maxlen (pRExC_state->maxlen)
203 #define RExC_npar (pRExC_state->npar)
204 #define RExC_nestroot (pRExC_state->nestroot)
205 #define RExC_extralen (pRExC_state->extralen)
206 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
207 #define RExC_utf8 (pRExC_state->utf8)
208 #define RExC_uni_semantics (pRExC_state->uni_semantics)
209 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
210 #define RExC_open_parens (pRExC_state->open_parens)
211 #define RExC_close_parens (pRExC_state->close_parens)
212 #define RExC_opend (pRExC_state->opend)
213 #define RExC_paren_names (pRExC_state->paren_names)
214 #define RExC_recurse (pRExC_state->recurse)
215 #define RExC_recurse_count (pRExC_state->recurse_count)
216 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
217 #define RExC_study_chunk_recursed_bytes \
218 (pRExC_state->study_chunk_recursed_bytes)
219 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
220 #define RExC_contains_locale (pRExC_state->contains_locale)
221 #define RExC_contains_i (pRExC_state->contains_i)
222 #define RExC_override_recoding (pRExC_state->override_recoding)
223 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
226 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
227 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
228 ((*s) == '{' && regcurly(s)))
231 * Flags to be passed up and down.
233 #define WORST 0 /* Worst case. */
234 #define HASWIDTH 0x01 /* Known to match non-null strings. */
236 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
237 * character. (There needs to be a case: in the switch statement in regexec.c
238 * for any node marked SIMPLE.) Note that this is not the same thing as
241 #define SPSTART 0x04 /* Starts with * or + */
242 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
243 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
244 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
246 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
248 /* whether trie related optimizations are enabled */
249 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
250 #define TRIE_STUDY_OPT
251 #define FULL_TRIE_STUDY
257 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
258 #define PBITVAL(paren) (1 << ((paren) & 7))
259 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
260 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
261 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
263 #define REQUIRE_UTF8 STMT_START { \
265 *flagp = RESTART_UTF8; \
270 /* This converts the named class defined in regcomp.h to its equivalent class
271 * number defined in handy.h. */
272 #define namedclass_to_classnum(class) ((int) ((class) / 2))
273 #define classnum_to_namedclass(classnum) ((classnum) * 2)
275 #define _invlist_union_complement_2nd(a, b, output) \
276 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
277 #define _invlist_intersection_complement_2nd(a, b, output) \
278 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
280 /* About scan_data_t.
282 During optimisation we recurse through the regexp program performing
283 various inplace (keyhole style) optimisations. In addition study_chunk
284 and scan_commit populate this data structure with information about
285 what strings MUST appear in the pattern. We look for the longest
286 string that must appear at a fixed location, and we look for the
287 longest string that may appear at a floating location. So for instance
292 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
293 strings (because they follow a .* construct). study_chunk will identify
294 both FOO and BAR as being the longest fixed and floating strings respectively.
296 The strings can be composites, for instance
300 will result in a composite fixed substring 'foo'.
302 For each string some basic information is maintained:
304 - offset or min_offset
305 This is the position the string must appear at, or not before.
306 It also implicitly (when combined with minlenp) tells us how many
307 characters must match before the string we are searching for.
308 Likewise when combined with minlenp and the length of the string it
309 tells us how many characters must appear after the string we have
313 Only used for floating strings. This is the rightmost point that
314 the string can appear at. If set to SSize_t_MAX it indicates that the
315 string can occur infinitely far to the right.
318 A pointer to the minimum number of characters of the pattern that the
319 string was found inside. This is important as in the case of positive
320 lookahead or positive lookbehind we can have multiple patterns
325 The minimum length of the pattern overall is 3, the minimum length
326 of the lookahead part is 3, but the minimum length of the part that
327 will actually match is 1. So 'FOO's minimum length is 3, but the
328 minimum length for the F is 1. This is important as the minimum length
329 is used to determine offsets in front of and behind the string being
330 looked for. Since strings can be composites this is the length of the
331 pattern at the time it was committed with a scan_commit. Note that
332 the length is calculated by study_chunk, so that the minimum lengths
333 are not known until the full pattern has been compiled, thus the
334 pointer to the value.
338 In the case of lookbehind the string being searched for can be
339 offset past the start point of the final matching string.
340 If this value was just blithely removed from the min_offset it would
341 invalidate some of the calculations for how many chars must match
342 before or after (as they are derived from min_offset and minlen and
343 the length of the string being searched for).
344 When the final pattern is compiled and the data is moved from the
345 scan_data_t structure into the regexp structure the information
346 about lookbehind is factored in, with the information that would
347 have been lost precalculated in the end_shift field for the
350 The fields pos_min and pos_delta are used to store the minimum offset
351 and the delta to the maximum offset at the current point in the pattern.
355 typedef struct scan_data_t {
356 /*I32 len_min; unused */
357 /*I32 len_delta; unused */
361 SSize_t last_end; /* min value, <0 unless valid. */
362 SSize_t last_start_min;
363 SSize_t last_start_max;
364 SV **longest; /* Either &l_fixed, or &l_float. */
365 SV *longest_fixed; /* longest fixed string found in pattern */
366 SSize_t offset_fixed; /* offset where it starts */
367 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
368 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
369 SV *longest_float; /* longest floating string found in pattern */
370 SSize_t offset_float_min; /* earliest point in string it can appear */
371 SSize_t offset_float_max; /* latest point in string it can appear */
372 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
373 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
376 SSize_t *last_closep;
377 regnode_ssc *start_class;
380 /* The below is perhaps overboard, but this allows us to save a test at the
381 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
382 * and 'a' differ by a single bit; the same with the upper and lower case of
383 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
384 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
385 * then inverts it to form a mask, with just a single 0, in the bit position
386 * where the upper- and lowercase differ. XXX There are about 40 other
387 * instances in the Perl core where this micro-optimization could be used.
388 * Should decide if maintenance cost is worse, before changing those
390 * Returns a boolean as to whether or not 'v' is either a lowercase or
391 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
392 * compile-time constant, the generated code is better than some optimizing
393 * compilers figure out, amounting to a mask and test. The results are
394 * meaningless if 'c' is not one of [A-Za-z] */
395 #define isARG2_lower_or_UPPER_ARG1(c, v) \
396 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
399 * Forward declarations for pregcomp()'s friends.
402 static const scan_data_t zero_scan_data =
403 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
405 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
406 #define SF_BEFORE_SEOL 0x0001
407 #define SF_BEFORE_MEOL 0x0002
408 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
409 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
411 #define SF_FIX_SHIFT_EOL (+2)
412 #define SF_FL_SHIFT_EOL (+4)
414 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
415 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
417 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
418 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
419 #define SF_IS_INF 0x0040
420 #define SF_HAS_PAR 0x0080
421 #define SF_IN_PAR 0x0100
422 #define SF_HAS_EVAL 0x0200
423 #define SCF_DO_SUBSTR 0x0400
424 #define SCF_DO_STCLASS_AND 0x0800
425 #define SCF_DO_STCLASS_OR 0x1000
426 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
427 #define SCF_WHILEM_VISITED_POS 0x2000
429 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
430 #define SCF_SEEN_ACCEPT 0x8000
431 #define SCF_TRIE_DOING_RESTUDY 0x10000
433 #define UTF cBOOL(RExC_utf8)
435 /* The enums for all these are ordered so things work out correctly */
436 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
437 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
438 == REGEX_DEPENDS_CHARSET)
439 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
440 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
441 >= REGEX_UNICODE_CHARSET)
442 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
443 == REGEX_ASCII_RESTRICTED_CHARSET)
444 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
445 >= REGEX_ASCII_RESTRICTED_CHARSET)
446 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
447 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
449 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
451 /* For programs that want to be strictly Unicode compatible by dying if any
452 * attempt is made to match a non-Unicode code point against a Unicode
454 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
456 #define OOB_NAMEDCLASS -1
458 /* There is no code point that is out-of-bounds, so this is problematic. But
459 * its only current use is to initialize a variable that is always set before
461 #define OOB_UNICODE 0xDEADBEEF
463 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
464 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
467 /* length of regex to show in messages that don't mark a position within */
468 #define RegexLengthToShowInErrorMessages 127
471 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
472 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
473 * op/pragma/warn/regcomp.
475 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
476 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
478 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
479 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
481 #define REPORT_LOCATION_ARGS(offset) \
482 UTF8fARG(UTF, offset, RExC_precomp), \
483 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
486 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
487 * arg. Show regex, up to a maximum length. If it's too long, chop and add
490 #define _FAIL(code) STMT_START { \
491 const char *ellipses = ""; \
492 IV len = RExC_end - RExC_precomp; \
495 SAVEFREESV(RExC_rx_sv); \
496 if (len > RegexLengthToShowInErrorMessages) { \
497 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
498 len = RegexLengthToShowInErrorMessages - 10; \
504 #define FAIL(msg) _FAIL( \
505 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
506 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
508 #define FAIL2(msg,arg) _FAIL( \
509 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
510 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
513 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
515 #define Simple_vFAIL(m) STMT_START { \
516 const IV offset = RExC_parse - RExC_precomp; \
517 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
518 m, REPORT_LOCATION_ARGS(offset)); \
522 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
524 #define vFAIL(m) STMT_START { \
526 SAVEFREESV(RExC_rx_sv); \
531 * Like Simple_vFAIL(), but accepts two arguments.
533 #define Simple_vFAIL2(m,a1) STMT_START { \
534 const IV offset = RExC_parse - RExC_precomp; \
535 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
536 REPORT_LOCATION_ARGS(offset)); \
540 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
542 #define vFAIL2(m,a1) STMT_START { \
544 SAVEFREESV(RExC_rx_sv); \
545 Simple_vFAIL2(m, a1); \
550 * Like Simple_vFAIL(), but accepts three arguments.
552 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
553 const IV offset = RExC_parse - RExC_precomp; \
554 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
555 REPORT_LOCATION_ARGS(offset)); \
559 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
561 #define vFAIL3(m,a1,a2) STMT_START { \
563 SAVEFREESV(RExC_rx_sv); \
564 Simple_vFAIL3(m, a1, a2); \
568 * Like Simple_vFAIL(), but accepts four arguments.
570 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
571 const IV offset = RExC_parse - RExC_precomp; \
572 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
573 REPORT_LOCATION_ARGS(offset)); \
576 #define vFAIL4(m,a1,a2,a3) STMT_START { \
578 SAVEFREESV(RExC_rx_sv); \
579 Simple_vFAIL4(m, a1, a2, a3); \
582 /* A specialized version of vFAIL2 that works with UTF8f */
583 #define vFAIL2utf8f(m, a1) STMT_START { \
584 const IV offset = RExC_parse - RExC_precomp; \
586 SAVEFREESV(RExC_rx_sv); \
587 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
588 REPORT_LOCATION_ARGS(offset)); \
592 /* m is not necessarily a "literal string", in this macro */
593 #define reg_warn_non_literal_string(loc, m) STMT_START { \
594 const IV offset = loc - RExC_precomp; \
595 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
596 m, REPORT_LOCATION_ARGS(offset)); \
599 #define ckWARNreg(loc,m) STMT_START { \
600 const IV offset = loc - RExC_precomp; \
601 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
602 REPORT_LOCATION_ARGS(offset)); \
605 #define vWARN_dep(loc, m) STMT_START { \
606 const IV offset = loc - RExC_precomp; \
607 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
608 REPORT_LOCATION_ARGS(offset)); \
611 #define ckWARNdep(loc,m) STMT_START { \
612 const IV offset = loc - RExC_precomp; \
613 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
615 REPORT_LOCATION_ARGS(offset)); \
618 #define ckWARNregdep(loc,m) STMT_START { \
619 const IV offset = loc - RExC_precomp; \
620 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
622 REPORT_LOCATION_ARGS(offset)); \
625 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
626 const IV offset = loc - RExC_precomp; \
627 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
629 a1, REPORT_LOCATION_ARGS(offset)); \
632 #define ckWARN2reg(loc, m, a1) STMT_START { \
633 const IV offset = loc - RExC_precomp; \
634 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
635 a1, REPORT_LOCATION_ARGS(offset)); \
638 #define vWARN3(loc, m, a1, a2) STMT_START { \
639 const IV offset = loc - RExC_precomp; \
640 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
641 a1, a2, REPORT_LOCATION_ARGS(offset)); \
644 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
645 const IV offset = loc - RExC_precomp; \
646 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
647 a1, a2, REPORT_LOCATION_ARGS(offset)); \
650 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
651 const IV offset = loc - RExC_precomp; \
652 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
653 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
656 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
657 const IV offset = loc - RExC_precomp; \
658 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
659 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
662 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
663 const IV offset = loc - RExC_precomp; \
664 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
665 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
669 /* Allow for side effects in s */
670 #define REGC(c,s) STMT_START { \
671 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
674 /* Macros for recording node offsets. 20001227 mjd@plover.com
675 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
676 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
677 * Element 0 holds the number n.
678 * Position is 1 indexed.
680 #ifndef RE_TRACK_PATTERN_OFFSETS
681 #define Set_Node_Offset_To_R(node,byte)
682 #define Set_Node_Offset(node,byte)
683 #define Set_Cur_Node_Offset
684 #define Set_Node_Length_To_R(node,len)
685 #define Set_Node_Length(node,len)
686 #define Set_Node_Cur_Length(node,start)
687 #define Node_Offset(n)
688 #define Node_Length(n)
689 #define Set_Node_Offset_Length(node,offset,len)
690 #define ProgLen(ri) ri->u.proglen
691 #define SetProgLen(ri,x) ri->u.proglen = x
693 #define ProgLen(ri) ri->u.offsets[0]
694 #define SetProgLen(ri,x) ri->u.offsets[0] = x
695 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
697 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
698 __LINE__, (int)(node), (int)(byte))); \
700 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
703 RExC_offsets[2*(node)-1] = (byte); \
708 #define Set_Node_Offset(node,byte) \
709 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
710 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
712 #define Set_Node_Length_To_R(node,len) STMT_START { \
714 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
715 __LINE__, (int)(node), (int)(len))); \
717 Perl_croak(aTHX_ "value of node is %d in Length macro", \
720 RExC_offsets[2*(node)] = (len); \
725 #define Set_Node_Length(node,len) \
726 Set_Node_Length_To_R((node)-RExC_emit_start, len)
727 #define Set_Node_Cur_Length(node, start) \
728 Set_Node_Length(node, RExC_parse - start)
730 /* Get offsets and lengths */
731 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
732 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
734 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
735 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
736 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
740 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
741 #define EXPERIMENTAL_INPLACESCAN
742 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
744 #define DEBUG_RExC_seen() \
745 DEBUG_OPTIMISE_MORE_r({ \
746 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
748 if (RExC_seen & REG_ZERO_LEN_SEEN) \
749 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
751 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
752 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
754 if (RExC_seen & REG_GPOS_SEEN) \
755 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
757 if (RExC_seen & REG_CANY_SEEN) \
758 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
760 if (RExC_seen & REG_RECURSE_SEEN) \
761 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
763 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
764 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
766 if (RExC_seen & REG_VERBARG_SEEN) \
767 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
769 if (RExC_seen & REG_CUTGROUP_SEEN) \
770 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
772 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
773 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
775 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
776 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
778 if (RExC_seen & REG_GOSTART_SEEN) \
779 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
781 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
782 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
784 PerlIO_printf(Perl_debug_log,"\n"); \
787 #define DEBUG_STUDYDATA(str,data,depth) \
788 DEBUG_OPTIMISE_MORE_r(if(data){ \
789 PerlIO_printf(Perl_debug_log, \
790 "%*s" str "Pos:%"IVdf"/%"IVdf \
791 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
792 (int)(depth)*2, "", \
793 (IV)((data)->pos_min), \
794 (IV)((data)->pos_delta), \
795 (UV)((data)->flags), \
796 (IV)((data)->whilem_c), \
797 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
798 is_inf ? "INF " : "" \
800 if ((data)->last_found) \
801 PerlIO_printf(Perl_debug_log, \
802 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
803 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
804 SvPVX_const((data)->last_found), \
805 (IV)((data)->last_end), \
806 (IV)((data)->last_start_min), \
807 (IV)((data)->last_start_max), \
808 ((data)->longest && \
809 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
810 SvPVX_const((data)->longest_fixed), \
811 (IV)((data)->offset_fixed), \
812 ((data)->longest && \
813 (data)->longest==&((data)->longest_float)) ? "*" : "", \
814 SvPVX_const((data)->longest_float), \
815 (IV)((data)->offset_float_min), \
816 (IV)((data)->offset_float_max) \
818 PerlIO_printf(Perl_debug_log,"\n"); \
821 /* Mark that we cannot extend a found fixed substring at this point.
822 Update the longest found anchored substring and the longest found
823 floating substrings if needed. */
826 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
827 SSize_t *minlenp, int is_inf)
829 const STRLEN l = CHR_SVLEN(data->last_found);
830 const STRLEN old_l = CHR_SVLEN(*data->longest);
831 GET_RE_DEBUG_FLAGS_DECL;
833 PERL_ARGS_ASSERT_SCAN_COMMIT;
835 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
836 SvSetMagicSV(*data->longest, data->last_found);
837 if (*data->longest == data->longest_fixed) {
838 data->offset_fixed = l ? data->last_start_min : data->pos_min;
839 if (data->flags & SF_BEFORE_EOL)
841 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
843 data->flags &= ~SF_FIX_BEFORE_EOL;
844 data->minlen_fixed=minlenp;
845 data->lookbehind_fixed=0;
847 else { /* *data->longest == data->longest_float */
848 data->offset_float_min = l ? data->last_start_min : data->pos_min;
849 data->offset_float_max = (l
850 ? data->last_start_max
851 : (data->pos_delta == SSize_t_MAX
853 : data->pos_min + data->pos_delta));
855 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
856 data->offset_float_max = SSize_t_MAX;
857 if (data->flags & SF_BEFORE_EOL)
859 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
861 data->flags &= ~SF_FL_BEFORE_EOL;
862 data->minlen_float=minlenp;
863 data->lookbehind_float=0;
866 SvCUR_set(data->last_found, 0);
868 SV * const sv = data->last_found;
869 if (SvUTF8(sv) && SvMAGICAL(sv)) {
870 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
876 data->flags &= ~SF_BEFORE_EOL;
877 DEBUG_STUDYDATA("commit: ",data,0);
880 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
881 * list that describes which code points it matches */
884 S_ssc_anything(pTHX_ regnode_ssc *ssc)
886 /* Set the SSC 'ssc' to match an empty string or any code point */
888 PERL_ARGS_ASSERT_SSC_ANYTHING;
890 assert(is_ANYOF_SYNTHETIC(ssc));
892 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
893 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
894 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
898 S_ssc_is_anything(const regnode_ssc *ssc)
900 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
901 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
902 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
903 * in any way, so there's no point in using it */
908 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
910 assert(is_ANYOF_SYNTHETIC(ssc));
912 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
916 /* See if the list consists solely of the range 0 - Infinity */
917 invlist_iterinit(ssc->invlist);
918 ret = invlist_iternext(ssc->invlist, &start, &end)
922 invlist_iterfinish(ssc->invlist);
928 /* If e.g., both \w and \W are set, matches everything */
929 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
931 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
932 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
942 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
944 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
945 * string, any code point, or any posix class under locale */
947 PERL_ARGS_ASSERT_SSC_INIT;
949 Zero(ssc, 1, regnode_ssc);
950 set_ANYOF_SYNTHETIC(ssc);
951 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
954 /* If any portion of the regex is to operate under locale rules,
955 * initialization includes it. The reason this isn't done for all regexes
956 * is that the optimizer was written under the assumption that locale was
957 * all-or-nothing. Given the complexity and lack of documentation in the
958 * optimizer, and that there are inadequate test cases for locale, many
959 * parts of it may not work properly, it is safest to avoid locale unless
961 if (RExC_contains_locale) {
962 ANYOF_POSIXL_SETALL(ssc);
965 ANYOF_POSIXL_ZERO(ssc);
970 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
971 const regnode_ssc *ssc)
973 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
974 * to the list of code points matched, and locale posix classes; hence does
975 * not check its flags) */
980 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
982 assert(is_ANYOF_SYNTHETIC(ssc));
984 invlist_iterinit(ssc->invlist);
985 ret = invlist_iternext(ssc->invlist, &start, &end)
989 invlist_iterfinish(ssc->invlist);
995 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1003 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1004 const regnode_charclass* const node)
1006 /* Returns a mortal inversion list defining which code points are matched
1007 * by 'node', which is of type ANYOF. Handles complementing the result if
1008 * appropriate. If some code points aren't knowable at this time, the
1009 * returned list must, and will, contain every code point that is a
1012 SV* invlist = sv_2mortal(_new_invlist(0));
1013 SV* only_utf8_locale_invlist = NULL;
1015 const U32 n = ARG(node);
1016 bool new_node_has_latin1 = FALSE;
1018 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1020 /* Look at the data structure created by S_set_ANYOF_arg() */
1021 if (n != ANYOF_NONBITMAP_EMPTY) {
1022 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1023 AV * const av = MUTABLE_AV(SvRV(rv));
1024 SV **const ary = AvARRAY(av);
1025 assert(RExC_rxi->data->what[n] == 's');
1027 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1028 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1030 else if (ary[0] && ary[0] != &PL_sv_undef) {
1032 /* Here, no compile-time swash, and there are things that won't be
1033 * known until runtime -- we have to assume it could be anything */
1034 return _add_range_to_invlist(invlist, 0, UV_MAX);
1036 else if (ary[3] && ary[3] != &PL_sv_undef) {
1038 /* Here no compile-time swash, and no run-time only data. Use the
1039 * node's inversion list */
1040 invlist = sv_2mortal(invlist_clone(ary[3]));
1043 /* Get the code points valid only under UTF-8 locales */
1044 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1045 && ary[2] && ary[2] != &PL_sv_undef)
1047 only_utf8_locale_invlist = ary[2];
1051 /* An ANYOF node contains a bitmap for the first 256 code points, and an
1052 * inversion list for the others, but if there are code points that should
1053 * match only conditionally on the target string being UTF-8, those are
1054 * placed in the inversion list, and not the bitmap. Since there are
1055 * circumstances under which they could match, they are included in the
1056 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
1057 * here, so that when we invert below, the end result actually does include
1058 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
1059 * before we add the unconditionally matched code points */
1060 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1061 _invlist_intersection_complement_2nd(invlist,
1066 /* Add in the points from the bit map */
1067 for (i = 0; i < 256; i++) {
1068 if (ANYOF_BITMAP_TEST(node, i)) {
1069 invlist = add_cp_to_invlist(invlist, i);
1070 new_node_has_latin1 = TRUE;
1074 /* If this can match all upper Latin1 code points, have to add them
1076 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1077 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1080 /* Similarly for these */
1081 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1082 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1085 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1086 _invlist_invert(invlist);
1088 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1090 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1091 * locale. We can skip this if there are no 0-255 at all. */
1092 _invlist_union(invlist, PL_Latin1, &invlist);
1095 /* Similarly add the UTF-8 locale possible matches. These have to be
1096 * deferred until after the non-UTF-8 locale ones are taken care of just
1097 * above, or it leads to wrong results under ANYOF_INVERT */
1098 if (only_utf8_locale_invlist) {
1099 _invlist_union_maybe_complement_2nd(invlist,
1100 only_utf8_locale_invlist,
1101 ANYOF_FLAGS(node) & ANYOF_INVERT,
1108 /* These two functions currently do the exact same thing */
1109 #define ssc_init_zero ssc_init
1111 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1112 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1114 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1115 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
1116 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1119 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1120 const regnode_charclass *and_with)
1122 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1123 * another SSC or a regular ANYOF class. Can create false positives. */
1128 PERL_ARGS_ASSERT_SSC_AND;
1130 assert(is_ANYOF_SYNTHETIC(ssc));
1132 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1133 * the code point inversion list and just the relevant flags */
1134 if (is_ANYOF_SYNTHETIC(and_with)) {
1135 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1136 anded_flags = ANYOF_FLAGS(and_with);
1138 /* XXX This is a kludge around what appears to be deficiencies in the
1139 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1140 * there are paths through the optimizer where it doesn't get weeded
1141 * out when it should. And if we don't make some extra provision for
1142 * it like the code just below, it doesn't get added when it should.
1143 * This solution is to add it only when AND'ing, which is here, and
1144 * only when what is being AND'ed is the pristine, original node
1145 * matching anything. Thus it is like adding it to ssc_anything() but
1146 * only when the result is to be AND'ed. Probably the same solution
1147 * could be adopted for the same problem we have with /l matching,
1148 * which is solved differently in S_ssc_init(), and that would lead to
1149 * fewer false positives than that solution has. But if this solution
1150 * creates bugs, the consequences are only that a warning isn't raised
1151 * that should be; while the consequences for having /l bugs is
1152 * incorrect matches */
1153 if (ssc_is_anything((regnode_ssc *)and_with)) {
1154 anded_flags |= ANYOF_WARN_SUPER;
1158 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1159 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1162 ANYOF_FLAGS(ssc) &= anded_flags;
1164 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1165 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1166 * 'and_with' may be inverted. When not inverted, we have the situation of
1168 * (C1 | P1) & (C2 | P2)
1169 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1170 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1171 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1172 * <= ((C1 & C2) | P1 | P2)
1173 * Alternatively, the last few steps could be:
1174 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1175 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1176 * <= (C1 | C2 | (P1 & P2))
1177 * We favor the second approach if either P1 or P2 is non-empty. This is
1178 * because these components are a barrier to doing optimizations, as what
1179 * they match cannot be known until the moment of matching as they are
1180 * dependent on the current locale, 'AND"ing them likely will reduce or
1182 * But we can do better if we know that C1,P1 are in their initial state (a
1183 * frequent occurrence), each matching everything:
1184 * (<everything>) & (C2 | P2) = C2 | P2
1185 * Similarly, if C2,P2 are in their initial state (again a frequent
1186 * occurrence), the result is a no-op
1187 * (C1 | P1) & (<everything>) = C1 | P1
1190 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1191 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1192 * <= (C1 & ~C2) | (P1 & ~P2)
1195 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1196 && ! is_ANYOF_SYNTHETIC(and_with))
1200 ssc_intersection(ssc,
1202 FALSE /* Has already been inverted */
1205 /* If either P1 or P2 is empty, the intersection will be also; can skip
1207 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1208 ANYOF_POSIXL_ZERO(ssc);
1210 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1212 /* Note that the Posix class component P from 'and_with' actually
1214 * P = Pa | Pb | ... | Pn
1215 * where each component is one posix class, such as in [\w\s].
1217 * ~P = ~(Pa | Pb | ... | Pn)
1218 * = ~Pa & ~Pb & ... & ~Pn
1219 * <= ~Pa | ~Pb | ... | ~Pn
1220 * The last is something we can easily calculate, but unfortunately
1221 * is likely to have many false positives. We could do better
1222 * in some (but certainly not all) instances if two classes in
1223 * P have known relationships. For example
1224 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1226 * :lower: & :print: = :lower:
1227 * And similarly for classes that must be disjoint. For example,
1228 * since \s and \w can have no elements in common based on rules in
1229 * the POSIX standard,
1230 * \w & ^\S = nothing
1231 * Unfortunately, some vendor locales do not meet the Posix
1232 * standard, in particular almost everything by Microsoft.
1233 * The loop below just changes e.g., \w into \W and vice versa */
1235 regnode_charclass_posixl temp;
1236 int add = 1; /* To calculate the index of the complement */
1238 ANYOF_POSIXL_ZERO(&temp);
1239 for (i = 0; i < ANYOF_MAX; i++) {
1241 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1242 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1244 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1245 ANYOF_POSIXL_SET(&temp, i + add);
1247 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1249 ANYOF_POSIXL_AND(&temp, ssc);
1251 } /* else ssc already has no posixes */
1252 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1253 in its initial state */
1254 else if (! is_ANYOF_SYNTHETIC(and_with)
1255 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1257 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1258 * copy it over 'ssc' */
1259 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1260 if (is_ANYOF_SYNTHETIC(and_with)) {
1261 StructCopy(and_with, ssc, regnode_ssc);
1264 ssc->invlist = anded_cp_list;
1265 ANYOF_POSIXL_ZERO(ssc);
1266 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1267 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1271 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1272 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1274 /* One or the other of P1, P2 is non-empty. */
1275 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1276 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1278 ssc_union(ssc, anded_cp_list, FALSE);
1280 else { /* P1 = P2 = empty */
1281 ssc_intersection(ssc, anded_cp_list, FALSE);
1287 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1288 const regnode_charclass *or_with)
1290 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1291 * another SSC or a regular ANYOF class. Can create false positives if
1292 * 'or_with' is to be inverted. */
1297 PERL_ARGS_ASSERT_SSC_OR;
1299 assert(is_ANYOF_SYNTHETIC(ssc));
1301 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1302 * the code point inversion list and just the relevant flags */
1303 if (is_ANYOF_SYNTHETIC(or_with)) {
1304 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1305 ored_flags = ANYOF_FLAGS(or_with);
1308 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1309 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1312 ANYOF_FLAGS(ssc) |= ored_flags;
1314 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1315 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1316 * 'or_with' may be inverted. When not inverted, we have the simple
1317 * situation of computing:
1318 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1319 * If P1|P2 yields a situation with both a class and its complement are
1320 * set, like having both \w and \W, this matches all code points, and we
1321 * can delete these from the P component of the ssc going forward. XXX We
1322 * might be able to delete all the P components, but I (khw) am not certain
1323 * about this, and it is better to be safe.
1326 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1327 * <= (C1 | P1) | ~C2
1328 * <= (C1 | ~C2) | P1
1329 * (which results in actually simpler code than the non-inverted case)
1332 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1333 && ! is_ANYOF_SYNTHETIC(or_with))
1335 /* We ignore P2, leaving P1 going forward */
1336 } /* else Not inverted */
1337 else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1338 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1339 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1341 for (i = 0; i < ANYOF_MAX; i += 2) {
1342 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1344 ssc_match_all_cp(ssc);
1345 ANYOF_POSIXL_CLEAR(ssc, i);
1346 ANYOF_POSIXL_CLEAR(ssc, i+1);
1354 FALSE /* Already has been inverted */
1358 PERL_STATIC_INLINE void
1359 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1361 PERL_ARGS_ASSERT_SSC_UNION;
1363 assert(is_ANYOF_SYNTHETIC(ssc));
1365 _invlist_union_maybe_complement_2nd(ssc->invlist,
1371 PERL_STATIC_INLINE void
1372 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1374 const bool invert2nd)
1376 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1378 assert(is_ANYOF_SYNTHETIC(ssc));
1380 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1386 PERL_STATIC_INLINE void
1387 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1389 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1391 assert(is_ANYOF_SYNTHETIC(ssc));
1393 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1396 PERL_STATIC_INLINE void
1397 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1399 /* AND just the single code point 'cp' into the SSC 'ssc' */
1401 SV* cp_list = _new_invlist(2);
1403 PERL_ARGS_ASSERT_SSC_CP_AND;
1405 assert(is_ANYOF_SYNTHETIC(ssc));
1407 cp_list = add_cp_to_invlist(cp_list, cp);
1408 ssc_intersection(ssc, cp_list,
1409 FALSE /* Not inverted */
1411 SvREFCNT_dec_NN(cp_list);
1414 PERL_STATIC_INLINE void
1415 S_ssc_clear_locale(regnode_ssc *ssc)
1417 /* Set the SSC 'ssc' to not match any locale things */
1418 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1420 assert(is_ANYOF_SYNTHETIC(ssc));
1422 ANYOF_POSIXL_ZERO(ssc);
1423 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1427 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1429 /* The inversion list in the SSC is marked mortal; now we need a more
1430 * permanent copy, which is stored the same way that is done in a regular
1431 * ANYOF node, with the first 256 code points in a bit map */
1433 SV* invlist = invlist_clone(ssc->invlist);
1435 PERL_ARGS_ASSERT_SSC_FINALIZE;
1437 assert(is_ANYOF_SYNTHETIC(ssc));
1439 /* The code in this file assumes that all but these flags aren't relevant
1440 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1441 * time we reach here */
1442 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1444 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1446 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1447 NULL, NULL, NULL, FALSE);
1449 /* Make sure is clone-safe */
1450 ssc->invlist = NULL;
1452 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1453 ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1456 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1459 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1460 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1461 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1462 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1463 ? (TRIE_LIST_CUR( idx ) - 1) \
1469 dump_trie(trie,widecharmap,revcharmap)
1470 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1471 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1473 These routines dump out a trie in a somewhat readable format.
1474 The _interim_ variants are used for debugging the interim
1475 tables that are used to generate the final compressed
1476 representation which is what dump_trie expects.
1478 Part of the reason for their existence is to provide a form
1479 of documentation as to how the different representations function.
1484 Dumps the final compressed table form of the trie to Perl_debug_log.
1485 Used for debugging make_trie().
1489 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1490 AV *revcharmap, U32 depth)
1493 SV *sv=sv_newmortal();
1494 int colwidth= widecharmap ? 6 : 4;
1496 GET_RE_DEBUG_FLAGS_DECL;
1498 PERL_ARGS_ASSERT_DUMP_TRIE;
1500 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1501 (int)depth * 2 + 2,"",
1502 "Match","Base","Ofs" );
1504 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1505 SV ** const tmp = av_fetch( revcharmap, state, 0);
1507 PerlIO_printf( Perl_debug_log, "%*s",
1509 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1510 PL_colors[0], PL_colors[1],
1511 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1512 PERL_PV_ESCAPE_FIRSTCHAR
1517 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1518 (int)depth * 2 + 2,"");
1520 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1521 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1522 PerlIO_printf( Perl_debug_log, "\n");
1524 for( state = 1 ; state < trie->statecount ; state++ ) {
1525 const U32 base = trie->states[ state ].trans.base;
1527 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1528 (int)depth * 2 + 2,"", (UV)state);
1530 if ( trie->states[ state ].wordnum ) {
1531 PerlIO_printf( Perl_debug_log, " W%4X",
1532 trie->states[ state ].wordnum );
1534 PerlIO_printf( Perl_debug_log, "%6s", "" );
1537 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1542 while( ( base + ofs < trie->uniquecharcount ) ||
1543 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1544 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1548 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1550 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1551 if ( ( base + ofs >= trie->uniquecharcount )
1552 && ( base + ofs - trie->uniquecharcount
1554 && trie->trans[ base + ofs
1555 - trie->uniquecharcount ].check == state )
1557 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1559 (UV)trie->trans[ base + ofs
1560 - trie->uniquecharcount ].next );
1562 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1566 PerlIO_printf( Perl_debug_log, "]");
1569 PerlIO_printf( Perl_debug_log, "\n" );
1571 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1573 for (word=1; word <= trie->wordcount; word++) {
1574 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1575 (int)word, (int)(trie->wordinfo[word].prev),
1576 (int)(trie->wordinfo[word].len));
1578 PerlIO_printf(Perl_debug_log, "\n" );
1581 Dumps a fully constructed but uncompressed trie in list form.
1582 List tries normally only are used for construction when the number of
1583 possible chars (trie->uniquecharcount) is very high.
1584 Used for debugging make_trie().
1587 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1588 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1592 SV *sv=sv_newmortal();
1593 int colwidth= widecharmap ? 6 : 4;
1594 GET_RE_DEBUG_FLAGS_DECL;
1596 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1598 /* print out the table precompression. */
1599 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1600 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1601 "------:-----+-----------------\n" );
1603 for( state=1 ; state < next_alloc ; state ++ ) {
1606 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1607 (int)depth * 2 + 2,"", (UV)state );
1608 if ( ! trie->states[ state ].wordnum ) {
1609 PerlIO_printf( Perl_debug_log, "%5s| ","");
1611 PerlIO_printf( Perl_debug_log, "W%4x| ",
1612 trie->states[ state ].wordnum
1615 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1616 SV ** const tmp = av_fetch( revcharmap,
1617 TRIE_LIST_ITEM(state,charid).forid, 0);
1619 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1621 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1623 PL_colors[0], PL_colors[1],
1624 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1625 | PERL_PV_ESCAPE_FIRSTCHAR
1627 TRIE_LIST_ITEM(state,charid).forid,
1628 (UV)TRIE_LIST_ITEM(state,charid).newstate
1631 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1632 (int)((depth * 2) + 14), "");
1635 PerlIO_printf( Perl_debug_log, "\n");
1640 Dumps a fully constructed but uncompressed trie in table form.
1641 This is the normal DFA style state transition table, with a few
1642 twists to facilitate compression later.
1643 Used for debugging make_trie().
1646 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1647 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1652 SV *sv=sv_newmortal();
1653 int colwidth= widecharmap ? 6 : 4;
1654 GET_RE_DEBUG_FLAGS_DECL;
1656 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1659 print out the table precompression so that we can do a visual check
1660 that they are identical.
1663 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1665 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1666 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1668 PerlIO_printf( Perl_debug_log, "%*s",
1670 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1671 PL_colors[0], PL_colors[1],
1672 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1673 PERL_PV_ESCAPE_FIRSTCHAR
1679 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1681 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1682 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1685 PerlIO_printf( Perl_debug_log, "\n" );
1687 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1689 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1690 (int)depth * 2 + 2,"",
1691 (UV)TRIE_NODENUM( state ) );
1693 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1694 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1696 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1698 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1700 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1701 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1702 (UV)trie->trans[ state ].check );
1704 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1705 (UV)trie->trans[ state ].check,
1706 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1714 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1715 startbranch: the first branch in the whole branch sequence
1716 first : start branch of sequence of branch-exact nodes.
1717 May be the same as startbranch
1718 last : Thing following the last branch.
1719 May be the same as tail.
1720 tail : item following the branch sequence
1721 count : words in the sequence
1722 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1723 depth : indent depth
1725 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1727 A trie is an N'ary tree where the branches are determined by digital
1728 decomposition of the key. IE, at the root node you look up the 1st character and
1729 follow that branch repeat until you find the end of the branches. Nodes can be
1730 marked as "accepting" meaning they represent a complete word. Eg:
1734 would convert into the following structure. Numbers represent states, letters
1735 following numbers represent valid transitions on the letter from that state, if
1736 the number is in square brackets it represents an accepting state, otherwise it
1737 will be in parenthesis.
1739 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1743 (1) +-i->(6)-+-s->[7]
1745 +-s->(3)-+-h->(4)-+-e->[5]
1747 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1749 This shows that when matching against the string 'hers' we will begin at state 1
1750 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1751 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1752 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1753 single traverse. We store a mapping from accepting to state to which word was
1754 matched, and then when we have multiple possibilities we try to complete the
1755 rest of the regex in the order in which they occured in the alternation.
1757 The only prior NFA like behaviour that would be changed by the TRIE support is
1758 the silent ignoring of duplicate alternations which are of the form:
1760 / (DUPE|DUPE) X? (?{ ... }) Y /x
1762 Thus EVAL blocks following a trie may be called a different number of times with
1763 and without the optimisation. With the optimisations dupes will be silently
1764 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1765 the following demonstrates:
1767 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1769 which prints out 'word' three times, but
1771 'words'=~/(word|word|word)(?{ print $1 })S/
1773 which doesnt print it out at all. This is due to other optimisations kicking in.
1775 Example of what happens on a structural level:
1777 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1779 1: CURLYM[1] {1,32767}(18)
1790 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1791 and should turn into:
1793 1: CURLYM[1] {1,32767}(18)
1795 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1803 Cases where tail != last would be like /(?foo|bar)baz/:
1813 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1814 and would end up looking like:
1817 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1824 d = uvchr_to_utf8_flags(d, uv, 0);
1826 is the recommended Unicode-aware way of saying
1831 #define TRIE_STORE_REVCHAR(val) \
1834 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1835 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1836 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1837 SvCUR_set(zlopp, kapow - flrbbbbb); \
1840 av_push(revcharmap, zlopp); \
1842 char ooooff = (char)val; \
1843 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1847 /* This gets the next character from the input, folding it if not already
1849 #define TRIE_READ_CHAR STMT_START { \
1852 /* if it is UTF then it is either already folded, or does not need \
1854 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
1856 else if (folder == PL_fold_latin1) { \
1857 /* This folder implies Unicode rules, which in the range expressible \
1858 * by not UTF is the lower case, with the two exceptions, one of \
1859 * which should have been taken care of before calling this */ \
1860 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1861 uvc = toLOWER_L1(*uc); \
1862 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1865 /* raw data, will be folded later if needed */ \
1873 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1874 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1875 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1876 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1878 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1879 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1880 TRIE_LIST_CUR( state )++; \
1883 #define TRIE_LIST_NEW(state) STMT_START { \
1884 Newxz( trie->states[ state ].trans.list, \
1885 4, reg_trie_trans_le ); \
1886 TRIE_LIST_CUR( state ) = 1; \
1887 TRIE_LIST_LEN( state ) = 4; \
1890 #define TRIE_HANDLE_WORD(state) STMT_START { \
1891 U16 dupe= trie->states[ state ].wordnum; \
1892 regnode * const noper_next = regnext( noper ); \
1895 /* store the word for dumping */ \
1897 if (OP(noper) != NOTHING) \
1898 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1900 tmp = newSVpvn_utf8( "", 0, UTF ); \
1901 av_push( trie_words, tmp ); \
1905 trie->wordinfo[curword].prev = 0; \
1906 trie->wordinfo[curword].len = wordlen; \
1907 trie->wordinfo[curword].accept = state; \
1909 if ( noper_next < tail ) { \
1911 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1913 trie->jump[curword] = (U16)(noper_next - convert); \
1915 jumper = noper_next; \
1917 nextbranch= regnext(cur); \
1921 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1922 /* chain, so that when the bits of chain are later */\
1923 /* linked together, the dups appear in the chain */\
1924 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1925 trie->wordinfo[dupe].prev = curword; \
1927 /* we haven't inserted this word yet. */ \
1928 trie->states[ state ].wordnum = curword; \
1933 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1934 ( ( base + charid >= ucharcount \
1935 && base + charid < ubound \
1936 && state == trie->trans[ base - ucharcount + charid ].check \
1937 && trie->trans[ base - ucharcount + charid ].next ) \
1938 ? trie->trans[ base - ucharcount + charid ].next \
1939 : ( state==1 ? special : 0 ) \
1943 #define MADE_JUMP_TRIE 2
1944 #define MADE_EXACT_TRIE 4
1947 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1948 regnode *first, regnode *last, regnode *tail,
1949 U32 word_count, U32 flags, U32 depth)
1952 /* first pass, loop through and scan words */
1953 reg_trie_data *trie;
1954 HV *widecharmap = NULL;
1955 AV *revcharmap = newAV();
1961 regnode *jumper = NULL;
1962 regnode *nextbranch = NULL;
1963 regnode *convert = NULL;
1964 U32 *prev_states; /* temp array mapping each state to previous one */
1965 /* we just use folder as a flag in utf8 */
1966 const U8 * folder = NULL;
1969 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1970 AV *trie_words = NULL;
1971 /* along with revcharmap, this only used during construction but both are
1972 * useful during debugging so we store them in the struct when debugging.
1975 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1976 STRLEN trie_charcount=0;
1978 SV *re_trie_maxbuff;
1979 GET_RE_DEBUG_FLAGS_DECL;
1981 PERL_ARGS_ASSERT_MAKE_TRIE;
1983 PERL_UNUSED_ARG(depth);
1990 case EXACTFU: folder = PL_fold_latin1; break;
1991 case EXACTF: folder = PL_fold; break;
1992 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1995 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1997 trie->startstate = 1;
1998 trie->wordcount = word_count;
1999 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2000 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2002 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2003 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2004 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2007 trie_words = newAV();
2010 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2011 assert(re_trie_maxbuff);
2012 if (!SvIOK(re_trie_maxbuff)) {
2013 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2015 DEBUG_TRIE_COMPILE_r({
2016 PerlIO_printf( Perl_debug_log,
2017 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2018 (int)depth * 2 + 2, "",
2019 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2020 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2023 /* Find the node we are going to overwrite */
2024 if ( first == startbranch && OP( last ) != BRANCH ) {
2025 /* whole branch chain */
2028 /* branch sub-chain */
2029 convert = NEXTOPER( first );
2032 /* -- First loop and Setup --
2034 We first traverse the branches and scan each word to determine if it
2035 contains widechars, and how many unique chars there are, this is
2036 important as we have to build a table with at least as many columns as we
2039 We use an array of integers to represent the character codes 0..255
2040 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2041 the native representation of the character value as the key and IV's for
2044 *TODO* If we keep track of how many times each character is used we can
2045 remap the columns so that the table compression later on is more
2046 efficient in terms of memory by ensuring the most common value is in the
2047 middle and the least common are on the outside. IMO this would be better
2048 than a most to least common mapping as theres a decent chance the most
2049 common letter will share a node with the least common, meaning the node
2050 will not be compressible. With a middle is most common approach the worst
2051 case is when we have the least common nodes twice.
2055 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2056 regnode *noper = NEXTOPER( cur );
2057 const U8 *uc = (U8*)STRING( noper );
2058 const U8 *e = uc + STR_LEN( noper );
2060 U32 wordlen = 0; /* required init */
2061 STRLEN minchars = 0;
2062 STRLEN maxchars = 0;
2063 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2066 if (OP(noper) == NOTHING) {
2067 regnode *noper_next= regnext(noper);
2068 if (noper_next != tail && OP(noper_next) == flags) {
2070 uc= (U8*)STRING(noper);
2071 e= uc + STR_LEN(noper);
2072 trie->minlen= STR_LEN(noper);
2079 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2080 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2081 regardless of encoding */
2082 if (OP( noper ) == EXACTFU_SS) {
2083 /* false positives are ok, so just set this */
2084 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2087 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2089 TRIE_CHARCOUNT(trie)++;
2092 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2093 * is in effect. Under /i, this character can match itself, or
2094 * anything that folds to it. If not under /i, it can match just
2095 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2096 * all fold to k, and all are single characters. But some folds
2097 * expand to more than one character, so for example LATIN SMALL
2098 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2099 * the string beginning at 'uc' is 'ffi', it could be matched by
2100 * three characters, or just by the one ligature character. (It
2101 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2102 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2103 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2104 * match.) The trie needs to know the minimum and maximum number
2105 * of characters that could match so that it can use size alone to
2106 * quickly reject many match attempts. The max is simple: it is
2107 * the number of folded characters in this branch (since a fold is
2108 * never shorter than what folds to it. */
2112 /* And the min is equal to the max if not under /i (indicated by
2113 * 'folder' being NULL), or there are no multi-character folds. If
2114 * there is a multi-character fold, the min is incremented just
2115 * once, for the character that folds to the sequence. Each
2116 * character in the sequence needs to be added to the list below of
2117 * characters in the trie, but we count only the first towards the
2118 * min number of characters needed. This is done through the
2119 * variable 'foldlen', which is returned by the macros that look
2120 * for these sequences as the number of bytes the sequence
2121 * occupies. Each time through the loop, we decrement 'foldlen' by
2122 * how many bytes the current char occupies. Only when it reaches
2123 * 0 do we increment 'minchars' or look for another multi-character
2125 if (folder == NULL) {
2128 else if (foldlen > 0) {
2129 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2134 /* See if *uc is the beginning of a multi-character fold. If
2135 * so, we decrement the length remaining to look at, to account
2136 * for the current character this iteration. (We can use 'uc'
2137 * instead of the fold returned by TRIE_READ_CHAR because for
2138 * non-UTF, the latin1_safe macro is smart enough to account
2139 * for all the unfolded characters, and because for UTF, the
2140 * string will already have been folded earlier in the
2141 * compilation process */
2143 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2144 foldlen -= UTF8SKIP(uc);
2147 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2152 /* The current character (and any potential folds) should be added
2153 * to the possible matching characters for this position in this
2157 U8 folded= folder[ (U8) uvc ];
2158 if ( !trie->charmap[ folded ] ) {
2159 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2160 TRIE_STORE_REVCHAR( folded );
2163 if ( !trie->charmap[ uvc ] ) {
2164 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2165 TRIE_STORE_REVCHAR( uvc );
2168 /* store the codepoint in the bitmap, and its folded
2170 TRIE_BITMAP_SET(trie, uvc);
2172 /* store the folded codepoint */
2173 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2176 /* store first byte of utf8 representation of
2177 variant codepoints */
2178 if (! UVCHR_IS_INVARIANT(uvc)) {
2179 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2182 set_bit = 0; /* We've done our bit :-) */
2186 /* XXX We could come up with the list of code points that fold
2187 * to this using PL_utf8_foldclosures, except not for
2188 * multi-char folds, as there may be multiple combinations
2189 * there that could work, which needs to wait until runtime to
2190 * resolve (The comment about LIGATURE FFI above is such an
2195 widecharmap = newHV();
2197 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2200 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2202 if ( !SvTRUE( *svpp ) ) {
2203 sv_setiv( *svpp, ++trie->uniquecharcount );
2204 TRIE_STORE_REVCHAR(uvc);
2207 } /* end loop through characters in this branch of the trie */
2209 /* We take the min and max for this branch and combine to find the min
2210 * and max for all branches processed so far */
2211 if( cur == first ) {
2212 trie->minlen = minchars;
2213 trie->maxlen = maxchars;
2214 } else if (minchars < trie->minlen) {
2215 trie->minlen = minchars;
2216 } else if (maxchars > trie->maxlen) {
2217 trie->maxlen = maxchars;
2219 } /* end first pass */
2220 DEBUG_TRIE_COMPILE_r(
2221 PerlIO_printf( Perl_debug_log,
2222 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2223 (int)depth * 2 + 2,"",
2224 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2225 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2226 (int)trie->minlen, (int)trie->maxlen )
2230 We now know what we are dealing with in terms of unique chars and
2231 string sizes so we can calculate how much memory a naive
2232 representation using a flat table will take. If it's over a reasonable
2233 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2234 conservative but potentially much slower representation using an array
2237 At the end we convert both representations into the same compressed
2238 form that will be used in regexec.c for matching with. The latter
2239 is a form that cannot be used to construct with but has memory
2240 properties similar to the list form and access properties similar
2241 to the table form making it both suitable for fast searches and
2242 small enough that its feasable to store for the duration of a program.
2244 See the comment in the code where the compressed table is produced
2245 inplace from the flat tabe representation for an explanation of how
2246 the compression works.
2251 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2254 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2255 > SvIV(re_trie_maxbuff) )
2258 Second Pass -- Array Of Lists Representation
2260 Each state will be represented by a list of charid:state records
2261 (reg_trie_trans_le) the first such element holds the CUR and LEN
2262 points of the allocated array. (See defines above).
2264 We build the initial structure using the lists, and then convert
2265 it into the compressed table form which allows faster lookups
2266 (but cant be modified once converted).
2269 STRLEN transcount = 1;
2271 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2272 "%*sCompiling trie using list compiler\n",
2273 (int)depth * 2 + 2, ""));
2275 trie->states = (reg_trie_state *)
2276 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2277 sizeof(reg_trie_state) );
2281 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2283 regnode *noper = NEXTOPER( cur );
2284 U8 *uc = (U8*)STRING( noper );
2285 const U8 *e = uc + STR_LEN( noper );
2286 U32 state = 1; /* required init */
2287 U16 charid = 0; /* sanity init */
2288 U32 wordlen = 0; /* required init */
2290 if (OP(noper) == NOTHING) {
2291 regnode *noper_next= regnext(noper);
2292 if (noper_next != tail && OP(noper_next) == flags) {
2294 uc= (U8*)STRING(noper);
2295 e= uc + STR_LEN(noper);
2299 if (OP(noper) != NOTHING) {
2300 for ( ; uc < e ; uc += len ) {
2305 charid = trie->charmap[ uvc ];
2307 SV** const svpp = hv_fetch( widecharmap,
2314 charid=(U16)SvIV( *svpp );
2317 /* charid is now 0 if we dont know the char read, or
2318 * nonzero if we do */
2325 if ( !trie->states[ state ].trans.list ) {
2326 TRIE_LIST_NEW( state );
2329 check <= TRIE_LIST_USED( state );
2332 if ( TRIE_LIST_ITEM( state, check ).forid
2335 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2340 newstate = next_alloc++;
2341 prev_states[newstate] = state;
2342 TRIE_LIST_PUSH( state, charid, newstate );
2347 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2351 TRIE_HANDLE_WORD(state);
2353 } /* end second pass */
2355 /* next alloc is the NEXT state to be allocated */
2356 trie->statecount = next_alloc;
2357 trie->states = (reg_trie_state *)
2358 PerlMemShared_realloc( trie->states,
2360 * sizeof(reg_trie_state) );
2362 /* and now dump it out before we compress it */
2363 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2364 revcharmap, next_alloc,
2368 trie->trans = (reg_trie_trans *)
2369 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2376 for( state=1 ; state < next_alloc ; state ++ ) {
2380 DEBUG_TRIE_COMPILE_MORE_r(
2381 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2385 if (trie->states[state].trans.list) {
2386 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2390 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2391 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2392 if ( forid < minid ) {
2394 } else if ( forid > maxid ) {
2398 if ( transcount < tp + maxid - minid + 1) {
2400 trie->trans = (reg_trie_trans *)
2401 PerlMemShared_realloc( trie->trans,
2403 * sizeof(reg_trie_trans) );
2404 Zero( trie->trans + (transcount / 2),
2408 base = trie->uniquecharcount + tp - minid;
2409 if ( maxid == minid ) {
2411 for ( ; zp < tp ; zp++ ) {
2412 if ( ! trie->trans[ zp ].next ) {
2413 base = trie->uniquecharcount + zp - minid;
2414 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2416 trie->trans[ zp ].check = state;
2422 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2424 trie->trans[ tp ].check = state;
2429 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2430 const U32 tid = base
2431 - trie->uniquecharcount
2432 + TRIE_LIST_ITEM( state, idx ).forid;
2433 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2435 trie->trans[ tid ].check = state;
2437 tp += ( maxid - minid + 1 );
2439 Safefree(trie->states[ state ].trans.list);
2442 DEBUG_TRIE_COMPILE_MORE_r(
2443 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2446 trie->states[ state ].trans.base=base;
2448 trie->lasttrans = tp + 1;
2452 Second Pass -- Flat Table Representation.
2454 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2455 each. We know that we will need Charcount+1 trans at most to store
2456 the data (one row per char at worst case) So we preallocate both
2457 structures assuming worst case.
2459 We then construct the trie using only the .next slots of the entry
2462 We use the .check field of the first entry of the node temporarily
2463 to make compression both faster and easier by keeping track of how
2464 many non zero fields are in the node.
2466 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2469 There are two terms at use here: state as a TRIE_NODEIDX() which is
2470 a number representing the first entry of the node, and state as a
2471 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2472 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2473 if there are 2 entrys per node. eg:
2481 The table is internally in the right hand, idx form. However as we
2482 also have to deal with the states array which is indexed by nodenum
2483 we have to use TRIE_NODENUM() to convert.
2486 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2487 "%*sCompiling trie using table compiler\n",
2488 (int)depth * 2 + 2, ""));
2490 trie->trans = (reg_trie_trans *)
2491 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2492 * trie->uniquecharcount + 1,
2493 sizeof(reg_trie_trans) );
2494 trie->states = (reg_trie_state *)
2495 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2496 sizeof(reg_trie_state) );
2497 next_alloc = trie->uniquecharcount + 1;
2500 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2502 regnode *noper = NEXTOPER( cur );
2503 const U8 *uc = (U8*)STRING( noper );
2504 const U8 *e = uc + STR_LEN( noper );
2506 U32 state = 1; /* required init */
2508 U16 charid = 0; /* sanity init */
2509 U32 accept_state = 0; /* sanity init */
2511 U32 wordlen = 0; /* required init */
2513 if (OP(noper) == NOTHING) {
2514 regnode *noper_next= regnext(noper);
2515 if (noper_next != tail && OP(noper_next) == flags) {
2517 uc= (U8*)STRING(noper);
2518 e= uc + STR_LEN(noper);
2522 if ( OP(noper) != NOTHING ) {
2523 for ( ; uc < e ; uc += len ) {
2528 charid = trie->charmap[ uvc ];
2530 SV* const * const svpp = hv_fetch( widecharmap,
2534 charid = svpp ? (U16)SvIV(*svpp) : 0;
2538 if ( !trie->trans[ state + charid ].next ) {
2539 trie->trans[ state + charid ].next = next_alloc;
2540 trie->trans[ state ].check++;
2541 prev_states[TRIE_NODENUM(next_alloc)]
2542 = TRIE_NODENUM(state);
2543 next_alloc += trie->uniquecharcount;
2545 state = trie->trans[ state + charid ].next;
2547 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2549 /* charid is now 0 if we dont know the char read, or
2550 * nonzero if we do */
2553 accept_state = TRIE_NODENUM( state );
2554 TRIE_HANDLE_WORD(accept_state);
2556 } /* end second pass */
2558 /* and now dump it out before we compress it */
2559 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2561 next_alloc, depth+1));
2565 * Inplace compress the table.*
2567 For sparse data sets the table constructed by the trie algorithm will
2568 be mostly 0/FAIL transitions or to put it another way mostly empty.
2569 (Note that leaf nodes will not contain any transitions.)
2571 This algorithm compresses the tables by eliminating most such
2572 transitions, at the cost of a modest bit of extra work during lookup:
2574 - Each states[] entry contains a .base field which indicates the
2575 index in the state[] array wheres its transition data is stored.
2577 - If .base is 0 there are no valid transitions from that node.
2579 - If .base is nonzero then charid is added to it to find an entry in
2582 -If trans[states[state].base+charid].check!=state then the
2583 transition is taken to be a 0/Fail transition. Thus if there are fail
2584 transitions at the front of the node then the .base offset will point
2585 somewhere inside the previous nodes data (or maybe even into a node
2586 even earlier), but the .check field determines if the transition is
2590 The following process inplace converts the table to the compressed
2591 table: We first do not compress the root node 1,and mark all its
2592 .check pointers as 1 and set its .base pointer as 1 as well. This
2593 allows us to do a DFA construction from the compressed table later,
2594 and ensures that any .base pointers we calculate later are greater
2597 - We set 'pos' to indicate the first entry of the second node.
2599 - We then iterate over the columns of the node, finding the first and
2600 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2601 and set the .check pointers accordingly, and advance pos
2602 appropriately and repreat for the next node. Note that when we copy
2603 the next pointers we have to convert them from the original
2604 NODEIDX form to NODENUM form as the former is not valid post
2607 - If a node has no transitions used we mark its base as 0 and do not
2608 advance the pos pointer.
2610 - If a node only has one transition we use a second pointer into the
2611 structure to fill in allocated fail transitions from other states.
2612 This pointer is independent of the main pointer and scans forward
2613 looking for null transitions that are allocated to a state. When it
2614 finds one it writes the single transition into the "hole". If the
2615 pointer doesnt find one the single transition is appended as normal.
2617 - Once compressed we can Renew/realloc the structures to release the
2620 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2621 specifically Fig 3.47 and the associated pseudocode.
2625 const U32 laststate = TRIE_NODENUM( next_alloc );
2628 trie->statecount = laststate;
2630 for ( state = 1 ; state < laststate ; state++ ) {
2632 const U32 stateidx = TRIE_NODEIDX( state );
2633 const U32 o_used = trie->trans[ stateidx ].check;
2634 U32 used = trie->trans[ stateidx ].check;
2635 trie->trans[ stateidx ].check = 0;
2638 used && charid < trie->uniquecharcount;
2641 if ( flag || trie->trans[ stateidx + charid ].next ) {
2642 if ( trie->trans[ stateidx + charid ].next ) {
2644 for ( ; zp < pos ; zp++ ) {
2645 if ( ! trie->trans[ zp ].next ) {
2649 trie->states[ state ].trans.base
2651 + trie->uniquecharcount
2653 trie->trans[ zp ].next
2654 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2656 trie->trans[ zp ].check = state;
2657 if ( ++zp > pos ) pos = zp;
2664 trie->states[ state ].trans.base
2665 = pos + trie->uniquecharcount - charid ;
2667 trie->trans[ pos ].next
2668 = SAFE_TRIE_NODENUM(
2669 trie->trans[ stateidx + charid ].next );
2670 trie->trans[ pos ].check = state;
2675 trie->lasttrans = pos + 1;
2676 trie->states = (reg_trie_state *)
2677 PerlMemShared_realloc( trie->states, laststate
2678 * sizeof(reg_trie_state) );
2679 DEBUG_TRIE_COMPILE_MORE_r(
2680 PerlIO_printf( Perl_debug_log,
2681 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2682 (int)depth * 2 + 2,"",
2683 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2687 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2690 } /* end table compress */
2692 DEBUG_TRIE_COMPILE_MORE_r(
2693 PerlIO_printf(Perl_debug_log,
2694 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2695 (int)depth * 2 + 2, "",
2696 (UV)trie->statecount,
2697 (UV)trie->lasttrans)
2699 /* resize the trans array to remove unused space */
2700 trie->trans = (reg_trie_trans *)
2701 PerlMemShared_realloc( trie->trans, trie->lasttrans
2702 * sizeof(reg_trie_trans) );
2704 { /* Modify the program and insert the new TRIE node */
2705 U8 nodetype =(U8)(flags & 0xFF);
2709 regnode *optimize = NULL;
2710 #ifdef RE_TRACK_PATTERN_OFFSETS
2713 U32 mjd_nodelen = 0;
2714 #endif /* RE_TRACK_PATTERN_OFFSETS */
2715 #endif /* DEBUGGING */
2717 This means we convert either the first branch or the first Exact,
2718 depending on whether the thing following (in 'last') is a branch
2719 or not and whther first is the startbranch (ie is it a sub part of
2720 the alternation or is it the whole thing.)
2721 Assuming its a sub part we convert the EXACT otherwise we convert
2722 the whole branch sequence, including the first.
2724 /* Find the node we are going to overwrite */
2725 if ( first != startbranch || OP( last ) == BRANCH ) {
2726 /* branch sub-chain */
2727 NEXT_OFF( first ) = (U16)(last - first);
2728 #ifdef RE_TRACK_PATTERN_OFFSETS
2730 mjd_offset= Node_Offset((convert));
2731 mjd_nodelen= Node_Length((convert));
2734 /* whole branch chain */
2736 #ifdef RE_TRACK_PATTERN_OFFSETS
2739 const regnode *nop = NEXTOPER( convert );
2740 mjd_offset= Node_Offset((nop));
2741 mjd_nodelen= Node_Length((nop));
2745 PerlIO_printf(Perl_debug_log,
2746 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2747 (int)depth * 2 + 2, "",
2748 (UV)mjd_offset, (UV)mjd_nodelen)
2751 /* But first we check to see if there is a common prefix we can
2752 split out as an EXACT and put in front of the TRIE node. */
2753 trie->startstate= 1;
2754 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2756 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2760 const U32 base = trie->states[ state ].trans.base;
2762 if ( trie->states[state].wordnum )
2765 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2766 if ( ( base + ofs >= trie->uniquecharcount ) &&
2767 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2768 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2770 if ( ++count > 1 ) {
2771 SV **tmp = av_fetch( revcharmap, ofs, 0);
2772 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2773 if ( state == 1 ) break;
2775 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2777 PerlIO_printf(Perl_debug_log,
2778 "%*sNew Start State=%"UVuf" Class: [",
2779 (int)depth * 2 + 2, "",
2782 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2783 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2785 TRIE_BITMAP_SET(trie,*ch);
2787 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2789 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2793 TRIE_BITMAP_SET(trie,*ch);
2795 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2796 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2802 SV **tmp = av_fetch( revcharmap, idx, 0);
2804 char *ch = SvPV( *tmp, len );
2806 SV *sv=sv_newmortal();
2807 PerlIO_printf( Perl_debug_log,
2808 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2809 (int)depth * 2 + 2, "",
2811 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2812 PL_colors[0], PL_colors[1],
2813 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2814 PERL_PV_ESCAPE_FIRSTCHAR
2819 OP( convert ) = nodetype;
2820 str=STRING(convert);
2823 STR_LEN(convert) += len;
2829 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2834 trie->prefixlen = (state-1);
2836 regnode *n = convert+NODE_SZ_STR(convert);
2837 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2838 trie->startstate = state;
2839 trie->minlen -= (state - 1);
2840 trie->maxlen -= (state - 1);
2842 /* At least the UNICOS C compiler choked on this
2843 * being argument to DEBUG_r(), so let's just have
2846 #ifdef PERL_EXT_RE_BUILD
2852 regnode *fix = convert;
2853 U32 word = trie->wordcount;
2855 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2856 while( ++fix < n ) {
2857 Set_Node_Offset_Length(fix, 0, 0);
2860 SV ** const tmp = av_fetch( trie_words, word, 0 );
2862 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2863 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2865 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2873 NEXT_OFF(convert) = (U16)(tail - convert);
2874 DEBUG_r(optimize= n);
2880 if ( trie->maxlen ) {
2881 NEXT_OFF( convert ) = (U16)(tail - convert);
2882 ARG_SET( convert, data_slot );
2883 /* Store the offset to the first unabsorbed branch in
2884 jump[0], which is otherwise unused by the jump logic.
2885 We use this when dumping a trie and during optimisation. */
2887 trie->jump[0] = (U16)(nextbranch - convert);
2889 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2890 * and there is a bitmap
2891 * and the first "jump target" node we found leaves enough room
2892 * then convert the TRIE node into a TRIEC node, with the bitmap
2893 * embedded inline in the opcode - this is hypothetically faster.
2895 if ( !trie->states[trie->startstate].wordnum
2897 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2899 OP( convert ) = TRIEC;
2900 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2901 PerlMemShared_free(trie->bitmap);
2904 OP( convert ) = TRIE;
2906 /* store the type in the flags */
2907 convert->flags = nodetype;
2911 + regarglen[ OP( convert ) ];
2913 /* XXX We really should free up the resource in trie now,
2914 as we won't use them - (which resources?) dmq */
2916 /* needed for dumping*/
2917 DEBUG_r(if (optimize) {
2918 regnode *opt = convert;
2920 while ( ++opt < optimize) {
2921 Set_Node_Offset_Length(opt,0,0);
2924 Try to clean up some of the debris left after the
2927 while( optimize < jumper ) {
2928 mjd_nodelen += Node_Length((optimize));
2929 OP( optimize ) = OPTIMIZED;
2930 Set_Node_Offset_Length(optimize,0,0);
2933 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2935 } /* end node insert */
2937 /* Finish populating the prev field of the wordinfo array. Walk back
2938 * from each accept state until we find another accept state, and if
2939 * so, point the first word's .prev field at the second word. If the
2940 * second already has a .prev field set, stop now. This will be the
2941 * case either if we've already processed that word's accept state,
2942 * or that state had multiple words, and the overspill words were
2943 * already linked up earlier.
2950 for (word=1; word <= trie->wordcount; word++) {
2952 if (trie->wordinfo[word].prev)
2954 state = trie->wordinfo[word].accept;
2956 state = prev_states[state];
2959 prev = trie->states[state].wordnum;
2963 trie->wordinfo[word].prev = prev;
2965 Safefree(prev_states);
2969 /* and now dump out the compressed format */
2970 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2972 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2974 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2975 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2977 SvREFCNT_dec_NN(revcharmap);
2981 : trie->startstate>1
2987 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
2989 /* The Trie is constructed and compressed now so we can build a fail array if
2992 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2994 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2998 We find the fail state for each state in the trie, this state is the longest
2999 proper suffix of the current state's 'word' that is also a proper prefix of
3000 another word in our trie. State 1 represents the word '' and is thus the
3001 default fail state. This allows the DFA not to have to restart after its
3002 tried and failed a word at a given point, it simply continues as though it
3003 had been matching the other word in the first place.
3005 'abcdgu'=~/abcdefg|cdgu/
3006 When we get to 'd' we are still matching the first word, we would encounter
3007 'g' which would fail, which would bring us to the state representing 'd' in
3008 the second word where we would try 'g' and succeed, proceeding to match
3011 /* add a fail transition */
3012 const U32 trie_offset = ARG(source);
3013 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3015 const U32 ucharcount = trie->uniquecharcount;
3016 const U32 numstates = trie->statecount;
3017 const U32 ubound = trie->lasttrans + ucharcount;
3021 U32 base = trie->states[ 1 ].trans.base;
3024 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3026 GET_RE_DEBUG_FLAGS_DECL;
3028 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3029 PERL_UNUSED_CONTEXT;
3031 PERL_UNUSED_ARG(depth);
3034 if ( OP(source) == TRIE ) {
3035 struct regnode_1 *op = (struct regnode_1 *)
3036 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3037 StructCopy(source,op,struct regnode_1);
3038 stclass = (regnode *)op;
3040 struct regnode_charclass *op = (struct regnode_charclass *)
3041 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3042 StructCopy(source,op,struct regnode_charclass);
3043 stclass = (regnode *)op;
3045 OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3047 ARG_SET( stclass, data_slot );
3048 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3049 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3050 aho->trie=trie_offset;
3051 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3052 Copy( trie->states, aho->states, numstates, reg_trie_state );
3053 Newxz( q, numstates, U32);
3054 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3057 /* initialize fail[0..1] to be 1 so that we always have
3058 a valid final fail state */
3059 fail[ 0 ] = fail[ 1 ] = 1;
3061 for ( charid = 0; charid < ucharcount ; charid++ ) {
3062 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3064 q[ q_write ] = newstate;
3065 /* set to point at the root */
3066 fail[ q[ q_write++ ] ]=1;
3069 while ( q_read < q_write) {
3070 const U32 cur = q[ q_read++ % numstates ];
3071 base = trie->states[ cur ].trans.base;
3073 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3074 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3076 U32 fail_state = cur;
3079 fail_state = fail[ fail_state ];
3080 fail_base = aho->states[ fail_state ].trans.base;
3081 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3083 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3084 fail[ ch_state ] = fail_state;
3085 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3087 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3089 q[ q_write++ % numstates] = ch_state;
3093 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3094 when we fail in state 1, this allows us to use the
3095 charclass scan to find a valid start char. This is based on the principle
3096 that theres a good chance the string being searched contains lots of stuff
3097 that cant be a start char.
3099 fail[ 0 ] = fail[ 1 ] = 0;
3100 DEBUG_TRIE_COMPILE_r({
3101 PerlIO_printf(Perl_debug_log,
3102 "%*sStclass Failtable (%"UVuf" states): 0",
3103 (int)(depth * 2), "", (UV)numstates
3105 for( q_read=1; q_read<numstates; q_read++ ) {
3106 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3108 PerlIO_printf(Perl_debug_log, "\n");
3111 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3116 #define DEBUG_PEEP(str,scan,depth) \
3117 DEBUG_OPTIMISE_r({if (scan){ \
3118 SV * const mysv=sv_newmortal(); \
3119 regnode *Next = regnext(scan); \
3120 regprop(RExC_rx, mysv, scan, NULL); \
3121 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3122 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3123 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3127 /* The below joins as many adjacent EXACTish nodes as possible into a single
3128 * one. The regop may be changed if the node(s) contain certain sequences that
3129 * require special handling. The joining is only done if:
3130 * 1) there is room in the current conglomerated node to entirely contain the
3132 * 2) they are the exact same node type
3134 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3135 * these get optimized out
3137 * If a node is to match under /i (folded), the number of characters it matches
3138 * can be different than its character length if it contains a multi-character
3139 * fold. *min_subtract is set to the total delta number of characters of the
3142 * And *unfolded_multi_char is set to indicate whether or not the node contains
3143 * an unfolded multi-char fold. This happens when whether the fold is valid or
3144 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3145 * SMALL LETTER SHARP S, as only if the target string being matched against
3146 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3147 * folding rules depend on the locale in force at runtime. (Multi-char folds
3148 * whose components are all above the Latin1 range are not run-time locale
3149 * dependent, and have already been folded by the time this function is
3152 * This is as good a place as any to discuss the design of handling these
3153 * multi-character fold sequences. It's been wrong in Perl for a very long
3154 * time. There are three code points in Unicode whose multi-character folds
3155 * were long ago discovered to mess things up. The previous designs for
3156 * dealing with these involved assigning a special node for them. This
3157 * approach doesn't always work, as evidenced by this example:
3158 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3159 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3160 * would match just the \xDF, it won't be able to handle the case where a
3161 * successful match would have to cross the node's boundary. The new approach
3162 * that hopefully generally solves the problem generates an EXACTFU_SS node
3163 * that is "sss" in this case.
3165 * It turns out that there are problems with all multi-character folds, and not
3166 * just these three. Now the code is general, for all such cases. The
3167 * approach taken is:
3168 * 1) This routine examines each EXACTFish node that could contain multi-
3169 * character folded sequences. Since a single character can fold into
3170 * such a sequence, the minimum match length for this node is less than
3171 * the number of characters in the node. This routine returns in
3172 * *min_subtract how many characters to subtract from the the actual
3173 * length of the string to get a real minimum match length; it is 0 if
3174 * there are no multi-char foldeds. This delta is used by the caller to
3175 * adjust the min length of the match, and the delta between min and max,
3176 * so that the optimizer doesn't reject these possibilities based on size
3178 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3179 * is used for an EXACTFU node that contains at least one "ss" sequence in
3180 * it. For non-UTF-8 patterns and strings, this is the only case where
3181 * there is a possible fold length change. That means that a regular
3182 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3183 * with length changes, and so can be processed faster. regexec.c takes
3184 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3185 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3186 * known until runtime). This saves effort in regex matching. However,
3187 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3188 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3189 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3190 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3191 * possibilities for the non-UTF8 patterns are quite simple, except for
3192 * the sharp s. All the ones that don't involve a UTF-8 target string are
3193 * members of a fold-pair, and arrays are set up for all of them so that
3194 * the other member of the pair can be found quickly. Code elsewhere in
3195 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3196 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3197 * described in the next item.
3198 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3199 * validity of the fold won't be known until runtime, and so must remain
3200 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3201 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3202 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3203 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3204 * The reason this is a problem is that the optimizer part of regexec.c
3205 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3206 * that a character in the pattern corresponds to at most a single
3207 * character in the target string. (And I do mean character, and not byte
3208 * here, unlike other parts of the documentation that have never been
3209 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3210 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3211 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3212 * nodes, violate the assumption, and they are the only instances where it
3213 * is violated. I'm reluctant to try to change the assumption, as the
3214 * code involved is impenetrable to me (khw), so instead the code here
3215 * punts. This routine examines EXACTFL nodes, and (when the pattern
3216 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3217 * boolean indicating whether or not the node contains such a fold. When
3218 * it is true, the caller sets a flag that later causes the optimizer in
3219 * this file to not set values for the floating and fixed string lengths,
3220 * and thus avoids the optimizer code in regexec.c that makes the invalid
3221 * assumption. Thus, there is no optimization based on string lengths for
3222 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3223 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3224 * assumption is wrong only in these cases is that all other non-UTF-8
3225 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3226 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3227 * EXACTF nodes because we don't know at compile time if it actually
3228 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3229 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3230 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3231 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3232 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3233 * string would require the pattern to be forced into UTF-8, the overhead
3234 * of which we want to avoid. Similarly the unfolded multi-char folds in
3235 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3238 * Similarly, the code that generates tries doesn't currently handle
3239 * not-already-folded multi-char folds, and it looks like a pain to change
3240 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3241 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3242 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3243 * using /iaa matching will be doing so almost entirely with ASCII
3244 * strings, so this should rarely be encountered in practice */
3246 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3247 if (PL_regkind[OP(scan)] == EXACT) \
3248 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3251 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3252 UV *min_subtract, bool *unfolded_multi_char,
3253 U32 flags,regnode *val, U32 depth)
3255 /* Merge several consecutive EXACTish nodes into one. */
3256 regnode *n = regnext(scan);
3258 regnode *next = scan + NODE_SZ_STR(scan);
3262 regnode *stop = scan;
3263 GET_RE_DEBUG_FLAGS_DECL;
3265 PERL_UNUSED_ARG(depth);
3268 PERL_ARGS_ASSERT_JOIN_EXACT;
3269 #ifndef EXPERIMENTAL_INPLACESCAN
3270 PERL_UNUSED_ARG(flags);
3271 PERL_UNUSED_ARG(val);
3273 DEBUG_PEEP("join",scan,depth);
3275 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3276 * EXACT ones that are mergeable to the current one. */
3278 && (PL_regkind[OP(n)] == NOTHING
3279 || (stringok && OP(n) == OP(scan)))
3281 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3284 if (OP(n) == TAIL || n > next)
3286 if (PL_regkind[OP(n)] == NOTHING) {
3287 DEBUG_PEEP("skip:",n,depth);
3288 NEXT_OFF(scan) += NEXT_OFF(n);
3289 next = n + NODE_STEP_REGNODE;
3296 else if (stringok) {
3297 const unsigned int oldl = STR_LEN(scan);
3298 regnode * const nnext = regnext(n);
3300 /* XXX I (khw) kind of doubt that this works on platforms (should
3301 * Perl ever run on one) where U8_MAX is above 255 because of lots
3302 * of other assumptions */
3303 /* Don't join if the sum can't fit into a single node */
3304 if (oldl + STR_LEN(n) > U8_MAX)
3307 DEBUG_PEEP("merg",n,depth);
3310 NEXT_OFF(scan) += NEXT_OFF(n);
3311 STR_LEN(scan) += STR_LEN(n);
3312 next = n + NODE_SZ_STR(n);
3313 /* Now we can overwrite *n : */
3314 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3322 #ifdef EXPERIMENTAL_INPLACESCAN
3323 if (flags && !NEXT_OFF(n)) {
3324 DEBUG_PEEP("atch", val, depth);
3325 if (reg_off_by_arg[OP(n)]) {
3326 ARG_SET(n, val - n);
3329 NEXT_OFF(n) = val - n;
3337 *unfolded_multi_char = FALSE;
3339 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3340 * can now analyze for sequences of problematic code points. (Prior to
3341 * this final joining, sequences could have been split over boundaries, and
3342 * hence missed). The sequences only happen in folding, hence for any
3343 * non-EXACT EXACTish node */
3344 if (OP(scan) != EXACT) {
3345 U8* s0 = (U8*) STRING(scan);
3347 U8* s_end = s0 + STR_LEN(scan);
3349 int total_count_delta = 0; /* Total delta number of characters that
3350 multi-char folds expand to */
3352 /* One pass is made over the node's string looking for all the
3353 * possibilities. To avoid some tests in the loop, there are two main
3354 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3359 if (OP(scan) == EXACTFL) {
3362 /* An EXACTFL node would already have been changed to another
3363 * node type unless there is at least one character in it that
3364 * is problematic; likely a character whose fold definition
3365 * won't be known until runtime, and so has yet to be folded.
3366 * For all but the UTF-8 locale, folds are 1-1 in length, but
3367 * to handle the UTF-8 case, we need to create a temporary
3368 * folded copy using UTF-8 locale rules in order to analyze it.
3369 * This is because our macros that look to see if a sequence is
3370 * a multi-char fold assume everything is folded (otherwise the
3371 * tests in those macros would be too complicated and slow).
3372 * Note that here, the non-problematic folds will have already
3373 * been done, so we can just copy such characters. We actually
3374 * don't completely fold the EXACTFL string. We skip the
3375 * unfolded multi-char folds, as that would just create work
3376 * below to figure out the size they already are */
3378 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3381 STRLEN s_len = UTF8SKIP(s);
3382 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3383 Copy(s, d, s_len, U8);
3386 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3387 *unfolded_multi_char = TRUE;
3388 Copy(s, d, s_len, U8);
3391 else if (isASCII(*s)) {
3392 *(d++) = toFOLD(*s);
3396 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3402 /* Point the remainder of the routine to look at our temporary
3406 } /* End of creating folded copy of EXACTFL string */
3408 /* Examine the string for a multi-character fold sequence. UTF-8
3409 * patterns have all characters pre-folded by the time this code is
3411 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3412 length sequence we are looking for is 2 */
3414 int count = 0; /* How many characters in a multi-char fold */
3415 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3416 if (! len) { /* Not a multi-char fold: get next char */
3421 /* Nodes with 'ss' require special handling, except for
3422 * EXACTFA-ish for which there is no multi-char fold to this */
3423 if (len == 2 && *s == 's' && *(s+1) == 's'
3424 && OP(scan) != EXACTFA
3425 && OP(scan) != EXACTFA_NO_TRIE)
3428 if (OP(scan) != EXACTFL) {
3429 OP(scan) = EXACTFU_SS;
3433 else { /* Here is a generic multi-char fold. */
3434 U8* multi_end = s + len;
3436 /* Count how many characters are in it. In the case of
3437 * /aa, no folds which contain ASCII code points are
3438 * allowed, so check for those, and skip if found. */
3439 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3440 count = utf8_length(s, multi_end);
3444 while (s < multi_end) {
3447 goto next_iteration;
3457 /* The delta is how long the sequence is minus 1 (1 is how long
3458 * the character that folds to the sequence is) */
3459 total_count_delta += count - 1;
3463 /* We created a temporary folded copy of the string in EXACTFL
3464 * nodes. Therefore we need to be sure it doesn't go below zero,
3465 * as the real string could be shorter */
3466 if (OP(scan) == EXACTFL) {
3467 int total_chars = utf8_length((U8*) STRING(scan),
3468 (U8*) STRING(scan) + STR_LEN(scan));
3469 if (total_count_delta > total_chars) {
3470 total_count_delta = total_chars;
3474 *min_subtract += total_count_delta;
3477 else if (OP(scan) == EXACTFA) {
3479 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3480 * fold to the ASCII range (and there are no existing ones in the
3481 * upper latin1 range). But, as outlined in the comments preceding
3482 * this function, we need to flag any occurrences of the sharp s.
3483 * This character forbids trie formation (because of added
3486 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3487 OP(scan) = EXACTFA_NO_TRIE;
3488 *unfolded_multi_char = TRUE;
3497 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3498 * folds that are all Latin1. As explained in the comments
3499 * preceding this function, we look also for the sharp s in EXACTF
3500 * and EXACTFL nodes; it can be in the final position. Otherwise
3501 * we can stop looking 1 byte earlier because have to find at least
3502 * two characters for a multi-fold */
3503 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3508 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3509 if (! len) { /* Not a multi-char fold. */
3510 if (*s == LATIN_SMALL_LETTER_SHARP_S
3511 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3513 *unfolded_multi_char = TRUE;
3520 && isARG2_lower_or_UPPER_ARG1('s', *s)
3521 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3524 /* EXACTF nodes need to know that the minimum length
3525 * changed so that a sharp s in the string can match this
3526 * ss in the pattern, but they remain EXACTF nodes, as they
3527 * won't match this unless the target string is is UTF-8,
3528 * which we don't know until runtime. EXACTFL nodes can't
3529 * transform into EXACTFU nodes */
3530 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3531 OP(scan) = EXACTFU_SS;
3535 *min_subtract += len - 1;
3542 /* Allow dumping but overwriting the collection of skipped
3543 * ops and/or strings with fake optimized ops */
3544 n = scan + NODE_SZ_STR(scan);
3552 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3556 /* REx optimizer. Converts nodes into quicker variants "in place".
3557 Finds fixed substrings. */
3559 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3560 to the position after last scanned or to NULL. */
3562 #define INIT_AND_WITHP \
3563 assert(!and_withp); \
3564 Newx(and_withp,1, regnode_ssc); \
3565 SAVEFREEPV(and_withp)
3567 /* this is a chain of data about sub patterns we are processing that
3568 need to be handled separately/specially in study_chunk. Its so
3569 we can simulate recursion without losing state. */
3571 typedef struct scan_frame {
3572 regnode *last; /* last node to process in this frame */
3573 regnode *next; /* next node to process when last is reached */
3574 struct scan_frame *prev; /*previous frame*/
3575 U32 prev_recursed_depth;
3576 I32 stop; /* what stopparen do we use */
3581 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3582 SSize_t *minlenp, SSize_t *deltap,
3587 regnode_ssc *and_withp,
3588 U32 flags, U32 depth)
3589 /* scanp: Start here (read-write). */
3590 /* deltap: Write maxlen-minlen here. */
3591 /* last: Stop before this one. */
3592 /* data: string data about the pattern */
3593 /* stopparen: treat close N as END */
3594 /* recursed: which subroutines have we recursed into */
3595 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3598 /* There must be at least this number of characters to match */
3601 regnode *scan = *scanp, *next;
3603 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3604 int is_inf_internal = 0; /* The studied chunk is infinite */
3605 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3606 scan_data_t data_fake;
3607 SV *re_trie_maxbuff = NULL;
3608 regnode *first_non_open = scan;
3609 SSize_t stopmin = SSize_t_MAX;
3610 scan_frame *frame = NULL;
3611 GET_RE_DEBUG_FLAGS_DECL;
3613 PERL_ARGS_ASSERT_STUDY_CHUNK;
3616 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3619 while (first_non_open && OP(first_non_open) == OPEN)
3620 first_non_open=regnext(first_non_open);
3625 while ( scan && OP(scan) != END && scan < last ){
3626 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3627 node length to get a real minimum (because
3628 the folded version may be shorter) */
3629 bool unfolded_multi_char = FALSE;
3630 /* Peephole optimizer: */
3631 DEBUG_OPTIMISE_MORE_r(
3633 PerlIO_printf(Perl_debug_log,
3634 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3635 ((int) depth*2), "", (long)stopparen,
3636 (unsigned long)depth, (unsigned long)recursed_depth);
3637 if (recursed_depth) {
3640 for ( j = 0 ; j < recursed_depth ; j++ ) {
3641 PerlIO_printf(Perl_debug_log,"[");
3642 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3643 PerlIO_printf(Perl_debug_log,"%d",
3644 PAREN_TEST(RExC_study_chunk_recursed +
3645 (j * RExC_study_chunk_recursed_bytes), i)
3648 PerlIO_printf(Perl_debug_log,"]");
3651 PerlIO_printf(Perl_debug_log,"\n");
3654 DEBUG_STUDYDATA("Peep:", data, depth);
3655 DEBUG_PEEP("Peep", scan, depth);
3658 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3659 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3660 * by a different invocation of reg() -- Yves
3662 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3664 /* Follow the next-chain of the current node and optimize
3665 away all the NOTHINGs from it. */
3666 if (OP(scan) != CURLYX) {
3667 const int max = (reg_off_by_arg[OP(scan)]
3669 /* I32 may be smaller than U16 on CRAYs! */
3670 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3671 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3675 /* Skip NOTHING and LONGJMP. */
3676 while ((n = regnext(n))
3677 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3678 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3679 && off + noff < max)
3681 if (reg_off_by_arg[OP(scan)])
3684 NEXT_OFF(scan) = off;
3689 /* The principal pseudo-switch. Cannot be a switch, since we
3690 look into several different things. */
3691 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3692 || OP(scan) == IFTHEN) {
3693 next = regnext(scan);
3695 /* demq: the op(next)==code check is to see if we have
3696 * "branch-branch" AFAICT */
3698 if (OP(next) == code || code == IFTHEN) {
3699 /* NOTE - There is similar code to this block below for
3700 * handling TRIE nodes on a re-study. If you change stuff here
3701 * check there too. */
3702 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3704 regnode * const startbranch=scan;
3706 if (flags & SCF_DO_SUBSTR) {
3707 /* Cannot merge strings after this. */
3708 scan_commit(pRExC_state, data, minlenp, is_inf);
3711 if (flags & SCF_DO_STCLASS)
3712 ssc_init_zero(pRExC_state, &accum);
3714 while (OP(scan) == code) {
3715 SSize_t deltanext, minnext, fake;
3717 regnode_ssc this_class;
3720 data_fake.flags = 0;
3722 data_fake.whilem_c = data->whilem_c;
3723 data_fake.last_closep = data->last_closep;
3726 data_fake.last_closep = &fake;
3728 data_fake.pos_delta = delta;
3729 next = regnext(scan);
3730 scan = NEXTOPER(scan);
3732 scan = NEXTOPER(scan);
3733 if (flags & SCF_DO_STCLASS) {
3734 ssc_init(pRExC_state, &this_class);
3735 data_fake.start_class = &this_class;
3736 f = SCF_DO_STCLASS_AND;
3738 if (flags & SCF_WHILEM_VISITED_POS)
3739 f |= SCF_WHILEM_VISITED_POS;
3741 /* we suppose the run is continuous, last=next...*/
3742 minnext = study_chunk(pRExC_state, &scan, minlenp,
3743 &deltanext, next, &data_fake, stopparen,
3744 recursed_depth, NULL, f,depth+1);
3747 if (deltanext == SSize_t_MAX) {
3748 is_inf = is_inf_internal = 1;
3750 } else if (max1 < minnext + deltanext)
3751 max1 = minnext + deltanext;
3753 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3755 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3756 if ( stopmin > minnext)
3757 stopmin = min + min1;
3758 flags &= ~SCF_DO_SUBSTR;
3760 data->flags |= SCF_SEEN_ACCEPT;
3763 if (data_fake.flags & SF_HAS_EVAL)
3764 data->flags |= SF_HAS_EVAL;
3765 data->whilem_c = data_fake.whilem_c;
3767 if (flags & SCF_DO_STCLASS)
3768 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3770 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3772 if (flags & SCF_DO_SUBSTR) {
3773 data->pos_min += min1;
3774 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3775 data->pos_delta = SSize_t_MAX;
3777 data->pos_delta += max1 - min1;
3778 if (max1 != min1 || is_inf)
3779 data->longest = &(data->longest_float);
3782 if (delta == SSize_t_MAX
3783 || SSize_t_MAX - delta - (max1 - min1) < 0)
3784 delta = SSize_t_MAX;
3786 delta += max1 - min1;
3787 if (flags & SCF_DO_STCLASS_OR) {
3788 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3790 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3791 flags &= ~SCF_DO_STCLASS;
3794 else if (flags & SCF_DO_STCLASS_AND) {
3796 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3797 flags &= ~SCF_DO_STCLASS;
3800 /* Switch to OR mode: cache the old value of
3801 * data->start_class */
3803 StructCopy(data->start_class, and_withp, regnode_ssc);
3804 flags &= ~SCF_DO_STCLASS_AND;
3805 StructCopy(&accum, data->start_class, regnode_ssc);
3806 flags |= SCF_DO_STCLASS_OR;
3810 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3811 OP( startbranch ) == BRANCH )
3815 Assuming this was/is a branch we are dealing with: 'scan'
3816 now points at the item that follows the branch sequence,
3817 whatever it is. We now start at the beginning of the
3818 sequence and look for subsequences of
3824 which would be constructed from a pattern like
3827 If we can find such a subsequence we need to turn the first
3828 element into a trie and then add the subsequent branch exact
3829 strings to the trie.
3833 1. patterns where the whole set of branches can be
3836 2. patterns where only a subset can be converted.
3838 In case 1 we can replace the whole set with a single regop
3839 for the trie. In case 2 we need to keep the start and end
3842 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3843 becomes BRANCH TRIE; BRANCH X;
3845 There is an additional case, that being where there is a
3846 common prefix, which gets split out into an EXACT like node
3847 preceding the TRIE node.
3849 If x(1..n)==tail then we can do a simple trie, if not we make
3850 a "jump" trie, such that when we match the appropriate word
3851 we "jump" to the appropriate tail node. Essentially we turn
3852 a nested if into a case structure of sorts.
3857 if (!re_trie_maxbuff) {
3858 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3859 if (!SvIOK(re_trie_maxbuff))
3860 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3862 if ( SvIV(re_trie_maxbuff)>=0 ) {
3864 regnode *first = (regnode *)NULL;
3865 regnode *last = (regnode *)NULL;
3866 regnode *tail = scan;
3871 SV * const mysv = sv_newmortal(); /* for dumping */
3873 /* var tail is used because there may be a TAIL
3874 regop in the way. Ie, the exacts will point to the
3875 thing following the TAIL, but the last branch will
3876 point at the TAIL. So we advance tail. If we
3877 have nested (?:) we may have to move through several
3881 while ( OP( tail ) == TAIL ) {
3882 /* this is the TAIL generated by (?:) */
3883 tail = regnext( tail );
3887 DEBUG_TRIE_COMPILE_r({
3888 regprop(RExC_rx, mysv, tail, NULL);
3889 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3890 (int)depth * 2 + 2, "",
3891 "Looking for TRIE'able sequences. Tail node is: ",
3892 SvPV_nolen_const( mysv )
3898 Step through the branches
3899 cur represents each branch,
3900 noper is the first thing to be matched as part
3902 noper_next is the regnext() of that node.
3904 We normally handle a case like this
3905 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3906 support building with NOJUMPTRIE, which restricts
3907 the trie logic to structures like /FOO|BAR/.
3909 If noper is a trieable nodetype then the branch is
3910 a possible optimization target. If we are building
3911 under NOJUMPTRIE then we require that noper_next is
3912 the same as scan (our current position in the regex
3915 Once we have two or more consecutive such branches
3916 we can create a trie of the EXACT's contents and
3917 stitch it in place into the program.
3919 If the sequence represents all of the branches in
3920 the alternation we replace the entire thing with a
3923 Otherwise when it is a subsequence we need to
3924 stitch it in place and replace only the relevant
3925 branches. This means the first branch has to remain
3926 as it is used by the alternation logic, and its
3927 next pointer, and needs to be repointed at the item
3928 on the branch chain following the last branch we
3929 have optimized away.
3931 This could be either a BRANCH, in which case the
3932 subsequence is internal, or it could be the item
3933 following the branch sequence in which case the
3934 subsequence is at the end (which does not
3935 necessarily mean the first node is the start of the
3938 TRIE_TYPE(X) is a define which maps the optype to a
3942 ----------------+-----------
3946 EXACTFU_SS | EXACTFU
3951 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3952 ( EXACT == (X) ) ? EXACT : \
3953 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3954 ( EXACTFA == (X) ) ? EXACTFA : \
3957 /* dont use tail as the end marker for this traverse */
3958 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3959 regnode * const noper = NEXTOPER( cur );
3960 U8 noper_type = OP( noper );
3961 U8 noper_trietype = TRIE_TYPE( noper_type );
3962 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3963 regnode * const noper_next = regnext( noper );
3964 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3965 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3968 DEBUG_TRIE_COMPILE_r({
3969 regprop(RExC_rx, mysv, cur, NULL);
3970 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3971 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3973 regprop(RExC_rx, mysv, noper, NULL);
3974 PerlIO_printf( Perl_debug_log, " -> %s",
3975 SvPV_nolen_const(mysv));
3978 regprop(RExC_rx, mysv, noper_next, NULL);
3979 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3980 SvPV_nolen_const(mysv));
3982 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3983 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3984 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3988 /* Is noper a trieable nodetype that can be merged
3989 * with the current trie (if there is one)? */
3993 ( noper_trietype == NOTHING)
3994 || ( trietype == NOTHING )
3995 || ( trietype == noper_trietype )
3998 && noper_next == tail
4002 /* Handle mergable triable node Either we are
4003 * the first node in a new trieable sequence,
4004 * in which case we do some bookkeeping,
4005 * otherwise we update the end pointer. */
4008 if ( noper_trietype == NOTHING ) {
4009 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4010 regnode * const noper_next = regnext( noper );
4011 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4012 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4015 if ( noper_next_trietype ) {
4016 trietype = noper_next_trietype;
4017 } else if (noper_next_type) {
4018 /* a NOTHING regop is 1 regop wide.
4019 * We need at least two for a trie
4020 * so we can't merge this in */
4024 trietype = noper_trietype;
4027 if ( trietype == NOTHING )
4028 trietype = noper_trietype;
4033 } /* end handle mergable triable node */
4035 /* handle unmergable node -
4036 * noper may either be a triable node which can
4037 * not be tried together with the current trie,
4038 * or a non triable node */
4040 /* If last is set and trietype is not
4041 * NOTHING then we have found at least two
4042 * triable branch sequences in a row of a
4043 * similar trietype so we can turn them
4044 * into a trie. If/when we allow NOTHING to
4045 * start a trie sequence this condition
4046 * will be required, and it isn't expensive
4047 * so we leave it in for now. */
4048 if ( trietype && trietype != NOTHING )
4049 make_trie( pRExC_state,
4050 startbranch, first, cur, tail,
4051 count, trietype, depth+1 );
4052 last = NULL; /* note: we clear/update
4053 first, trietype etc below,
4054 so we dont do it here */
4058 && noper_next == tail
4061 /* noper is triable, so we can start a new
4065 trietype = noper_trietype;
4067 /* if we already saw a first but the
4068 * current node is not triable then we have
4069 * to reset the first information. */
4074 } /* end handle unmergable node */
4075 } /* loop over branches */
4076 DEBUG_TRIE_COMPILE_r({
4077 regprop(RExC_rx, mysv, cur, NULL);
4078 PerlIO_printf( Perl_debug_log,
4079 "%*s- %s (%d) <SCAN FINISHED>\n",
4081 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4084 if ( last && trietype ) {
4085 if ( trietype != NOTHING ) {
4086 /* the last branch of the sequence was part of
4087 * a trie, so we have to construct it here
4088 * outside of the loop */
4089 made= make_trie( pRExC_state, startbranch,
4090 first, scan, tail, count,
4091 trietype, depth+1 );
4092 #ifdef TRIE_STUDY_OPT
4093 if ( ((made == MADE_EXACT_TRIE &&
4094 startbranch == first)
4095 || ( first_non_open == first )) &&
4097 flags |= SCF_TRIE_RESTUDY;
4098 if ( startbranch == first
4101 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4106 /* at this point we know whatever we have is a
4107 * NOTHING sequence/branch AND if 'startbranch'
4108 * is 'first' then we can turn the whole thing
4111 if ( startbranch == first ) {
4113 /* the entire thing is a NOTHING sequence,
4114 * something like this: (?:|) So we can
4115 * turn it into a plain NOTHING op. */
4116 DEBUG_TRIE_COMPILE_r({
4117 regprop(RExC_rx, mysv, cur, NULL);
4118 PerlIO_printf( Perl_debug_log,
4119 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4120 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4123 OP(startbranch)= NOTHING;
4124 NEXT_OFF(startbranch)= tail - startbranch;
4125 for ( opt= startbranch + 1; opt < tail ; opt++ )
4129 } /* end if ( last) */
4130 } /* TRIE_MAXBUF is non zero */
4135 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4136 scan = NEXTOPER(NEXTOPER(scan));
4137 } else /* single branch is optimized. */
4138 scan = NEXTOPER(scan);
4140 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4141 scan_frame *newframe = NULL;
4145 U32 my_recursed_depth= recursed_depth;
4147 if (OP(scan) != SUSPEND) {
4148 /* set the pointer */
4149 if (OP(scan) == GOSUB) {
4151 RExC_recurse[ARG2L(scan)] = scan;
4152 start = RExC_open_parens[paren-1];
4153 end = RExC_close_parens[paren-1];
4156 start = RExC_rxi->program + 1;
4161 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4163 if (!recursed_depth) {
4164 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4166 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4167 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4168 RExC_study_chunk_recursed_bytes, U8);
4170 /* we havent recursed into this paren yet, so recurse into it */
4171 DEBUG_STUDYDATA("set:", data,depth);
4172 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4173 my_recursed_depth= recursed_depth + 1;
4174 Newx(newframe,1,scan_frame);
4176 DEBUG_STUDYDATA("inf:", data,depth);
4177 /* some form of infinite recursion, assume infinite length
4179 if (flags & SCF_DO_SUBSTR) {
4180 scan_commit(pRExC_state, data, minlenp, is_inf);
4181 data->longest = &(data->longest_float);
4183 is_inf = is_inf_internal = 1;
4184 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4185 ssc_anything(data->start_class);
4186 flags &= ~SCF_DO_STCLASS;
4189 Newx(newframe,1,scan_frame);
4192 end = regnext(scan);
4197 SAVEFREEPV(newframe);
4198 newframe->next = regnext(scan);
4199 newframe->last = last;
4200 newframe->stop = stopparen;
4201 newframe->prev = frame;
4202 newframe->prev_recursed_depth = recursed_depth;
4204 DEBUG_STUDYDATA("frame-new:",data,depth);
4205 DEBUG_PEEP("fnew", scan, depth);
4212 recursed_depth= my_recursed_depth;
4217 else if (OP(scan) == EXACT) {
4218 SSize_t l = STR_LEN(scan);
4221 const U8 * const s = (U8*)STRING(scan);
4222 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4223 l = utf8_length(s, s + l);
4225 uc = *((U8*)STRING(scan));
4228 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4229 /* The code below prefers earlier match for fixed
4230 offset, later match for variable offset. */
4231 if (data->last_end == -1) { /* Update the start info. */
4232 data->last_start_min = data->pos_min;
4233 data->last_start_max = is_inf
4234 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4236 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4238 SvUTF8_on(data->last_found);
4240 SV * const sv = data->last_found;
4241 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4242 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4243 if (mg && mg->mg_len >= 0)
4244 mg->mg_len += utf8_length((U8*)STRING(scan),
4245 (U8*)STRING(scan)+STR_LEN(scan));
4247 data->last_end = data->pos_min + l;
4248 data->pos_min += l; /* As in the first entry. */
4249 data->flags &= ~SF_BEFORE_EOL;
4252 /* ANDing the code point leaves at most it, and not in locale, and
4253 * can't match null string */
4254 if (flags & SCF_DO_STCLASS_AND) {
4255 ssc_cp_and(data->start_class, uc);
4256 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4257 ssc_clear_locale(data->start_class);
4259 else if (flags & SCF_DO_STCLASS_OR) {
4260 ssc_add_cp(data->start_class, uc);
4261 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4263 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4264 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4266 flags &= ~SCF_DO_STCLASS;
4268 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
4270 SSize_t l = STR_LEN(scan);
4271 UV uc = *((U8*)STRING(scan));
4272 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4273 separate code points */
4274 const U8 * s = (U8*)STRING(scan);
4276 /* Search for fixed substrings supports EXACT only. */
4277 if (flags & SCF_DO_SUBSTR) {
4279 scan_commit(pRExC_state, data, minlenp, is_inf);
4282 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4283 l = utf8_length(s, s + l);
4285 if (unfolded_multi_char) {
4286 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4288 min += l - min_subtract;
4290 delta += min_subtract;
4291 if (flags & SCF_DO_SUBSTR) {
4292 data->pos_min += l - min_subtract;
4293 if (data->pos_min < 0) {
4296 data->pos_delta += min_subtract;
4298 data->longest = &(data->longest_float);
4302 if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4303 ssc_clear_locale(data->start_class);
4308 /* We punt and assume can match anything if the node begins
4309 * with a multi-character fold. Things are complicated. For
4310 * example, /ffi/i could match any of:
4311 * "\N{LATIN SMALL LIGATURE FFI}"
4312 * "\N{LATIN SMALL LIGATURE FF}I"
4313 * "F\N{LATIN SMALL LIGATURE FI}"
4314 * plus several other things; and making sure we have all the
4315 * possibilities is hard. */
4316 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4318 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4322 /* Any Latin1 range character can potentially match any
4323 * other depending on the locale */
4324 if (OP(scan) == EXACTFL) {
4325 _invlist_union(EXACTF_invlist, PL_Latin1,
4329 /* But otherwise, it matches at least itself. We can
4330 * quickly tell if it has a distinct fold, and if so,
4331 * it matches that as well */
4332 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4333 if (IS_IN_SOME_FOLD_L1(uc)) {
4334 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4335 PL_fold_latin1[uc]);
4339 /* Some characters match above-Latin1 ones under /i. This
4340 * is true of EXACTFL ones when the locale is UTF-8 */
4341 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4342 && (! isASCII(uc) || (OP(scan) != EXACTFA
4343 && OP(scan) != EXACTFA_NO_TRIE)))
4345 add_above_Latin1_folds(pRExC_state,
4351 else { /* Pattern is UTF-8 */
4352 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4353 STRLEN foldlen = UTF8SKIP(s);
4354 const U8* e = s + STR_LEN(scan);
4357 /* The only code points that aren't folded in a UTF EXACTFish
4358 * node are are the problematic ones in EXACTFL nodes */
4359 if (OP(scan) == EXACTFL
4360 && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4362 /* We need to check for the possibility that this EXACTFL
4363 * node begins with a multi-char fold. Therefore we fold
4364 * the first few characters of it so that we can make that
4369 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4371 *(d++) = (U8) toFOLD(*s);
4376 to_utf8_fold(s, d, &len);
4382 /* And set up so the code below that looks in this folded
4383 * buffer instead of the node's string */
4385 foldlen = UTF8SKIP(folded);
4389 /* When we reach here 's' points to the fold of the first
4390 * character(s) of the node; and 'e' points to far enough along
4391 * the folded string to be just past any possible multi-char
4392 * fold. 'foldlen' is the length in bytes of the first
4395 * Unlike the non-UTF-8 case, the macro for determining if a
4396 * string is a multi-char fold requires all the characters to
4397 * already be folded. This is because of all the complications
4398 * if not. Note that they are folded anyway, except in EXACTFL
4399 * nodes. Like the non-UTF case above, we punt if the node
4400 * begins with a multi-char fold */
4402 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4404 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4406 else { /* Single char fold */
4408 /* It matches all the things that fold to it, which are
4409 * found in PL_utf8_foldclosures (including itself) */
4410 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4411 if (! PL_utf8_foldclosures) {
4412 _load_PL_utf8_foldclosures();
4414 if ((listp = hv_fetch(PL_utf8_foldclosures,
4415 (char *) s, foldlen, FALSE)))
4417 AV* list = (AV*) *listp;
4419 for (k = 0; k <= av_tindex(list); k++) {
4420 SV** c_p = av_fetch(list, k, FALSE);
4426 /* /aa doesn't allow folds between ASCII and non- */
4427 if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4428 && isASCII(c) != isASCII(uc))
4433 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4438 if (flags & SCF_DO_STCLASS_AND) {
4439 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4440 ANYOF_POSIXL_ZERO(data->start_class);
4441 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4443 else if (flags & SCF_DO_STCLASS_OR) {
4444 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4445 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4447 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4448 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4450 flags &= ~SCF_DO_STCLASS;
4451 SvREFCNT_dec(EXACTF_invlist);
4453 else if (REGNODE_VARIES(OP(scan))) {
4454 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4455 I32 fl = 0, f = flags;
4456 regnode * const oscan = scan;
4457 regnode_ssc this_class;
4458 regnode_ssc *oclass = NULL;
4459 I32 next_is_eval = 0;
4461 switch (PL_regkind[OP(scan)]) {
4462 case WHILEM: /* End of (?:...)* . */
4463 scan = NEXTOPER(scan);
4466 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4467 next = NEXTOPER(scan);
4468 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4470 maxcount = REG_INFTY;
4471 next = regnext(scan);
4472 scan = NEXTOPER(scan);
4476 if (flags & SCF_DO_SUBSTR)
4481 if (flags & SCF_DO_STCLASS) {
4483 maxcount = REG_INFTY;
4484 next = regnext(scan);
4485 scan = NEXTOPER(scan);
4488 if (flags & SCF_DO_SUBSTR) {
4489 scan_commit(pRExC_state, data, minlenp, is_inf);
4490 /* Cannot extend fixed substrings */
4491 data->longest = &(data->longest_float);
4493 is_inf = is_inf_internal = 1;
4494 scan = regnext(scan);
4495 goto optimize_curly_tail;
4497 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4498 && (scan->flags == stopparen))
4503 mincount = ARG1(scan);
4504 maxcount = ARG2(scan);
4506 next = regnext(scan);
4507 if (OP(scan) == CURLYX) {
4508 I32 lp = (data ? *(data->last_closep) : 0);
4509 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4511 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4512 next_is_eval = (OP(scan) == EVAL);
4514 if (flags & SCF_DO_SUBSTR) {
4516 scan_commit(pRExC_state, data, minlenp, is_inf);
4517 /* Cannot extend fixed substrings */
4518 pos_before = data->pos_min;
4522 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4524 data->flags |= SF_IS_INF;
4526 if (flags & SCF_DO_STCLASS) {
4527 ssc_init(pRExC_state, &this_class);
4528 oclass = data->start_class;
4529 data->start_class = &this_class;
4530 f |= SCF_DO_STCLASS_AND;
4531 f &= ~SCF_DO_STCLASS_OR;
4533 /* Exclude from super-linear cache processing any {n,m}
4534 regops for which the combination of input pos and regex
4535 pos is not enough information to determine if a match
4538 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4539 regex pos at the \s*, the prospects for a match depend not
4540 only on the input position but also on how many (bar\s*)
4541 repeats into the {4,8} we are. */
4542 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4543 f &= ~SCF_WHILEM_VISITED_POS;
4545 /* This will finish on WHILEM, setting scan, or on NULL: */
4546 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4547 last, data, stopparen, recursed_depth, NULL,
4549 ? (f & ~SCF_DO_SUBSTR)
4553 if (flags & SCF_DO_STCLASS)
4554 data->start_class = oclass;
4555 if (mincount == 0 || minnext == 0) {
4556 if (flags & SCF_DO_STCLASS_OR) {
4557 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4559 else if (flags & SCF_DO_STCLASS_AND) {
4560 /* Switch to OR mode: cache the old value of
4561 * data->start_class */
4563 StructCopy(data->start_class, and_withp, regnode_ssc);
4564 flags &= ~SCF_DO_STCLASS_AND;
4565 StructCopy(&this_class, data->start_class, regnode_ssc);
4566 flags |= SCF_DO_STCLASS_OR;
4567 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4569 } else { /* Non-zero len */
4570 if (flags & SCF_DO_STCLASS_OR) {
4571 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4572 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4574 else if (flags & SCF_DO_STCLASS_AND)
4575 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4576 flags &= ~SCF_DO_STCLASS;
4578 if (!scan) /* It was not CURLYX, but CURLY. */
4580 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4581 /* ? quantifier ok, except for (?{ ... }) */
4582 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4583 && (minnext == 0) && (deltanext == 0)
4584 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4585 && maxcount <= REG_INFTY/3) /* Complement check for big
4588 /* Fatal warnings may leak the regexp without this: */
4589 SAVEFREESV(RExC_rx_sv);
4590 ckWARNreg(RExC_parse,
4591 "Quantifier unexpected on zero-length expression");
4592 (void)ReREFCNT_inc(RExC_rx_sv);
4595 min += minnext * mincount;
4596 is_inf_internal |= deltanext == SSize_t_MAX
4597 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4598 is_inf |= is_inf_internal;
4600 delta = SSize_t_MAX;
4602 delta += (minnext + deltanext) * maxcount
4603 - minnext * mincount;
4605 /* Try powerful optimization CURLYX => CURLYN. */
4606 if ( OP(oscan) == CURLYX && data
4607 && data->flags & SF_IN_PAR
4608 && !(data->flags & SF_HAS_EVAL)
4609 && !deltanext && minnext == 1 ) {
4610 /* Try to optimize to CURLYN. */
4611 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4612 regnode * const nxt1 = nxt;
4619 if (!REGNODE_SIMPLE(OP(nxt))
4620 && !(PL_regkind[OP(nxt)] == EXACT
4621 && STR_LEN(nxt) == 1))
4627 if (OP(nxt) != CLOSE)
4629 if (RExC_open_parens) {
4630 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4631 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4633 /* Now we know that nxt2 is the only contents: */
4634 oscan->flags = (U8)ARG(nxt);
4636 OP(nxt1) = NOTHING; /* was OPEN. */
4639 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4640 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4641 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4642 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4643 OP(nxt + 1) = OPTIMIZED; /* was count. */
4644 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4649 /* Try optimization CURLYX => CURLYM. */
4650 if ( OP(oscan) == CURLYX && data
4651 && !(data->flags & SF_HAS_PAR)
4652 && !(data->flags & SF_HAS_EVAL)
4653 && !deltanext /* atom is fixed width */
4654 && minnext != 0 /* CURLYM can't handle zero width */
4656 /* Nor characters whose fold at run-time may be
4657 * multi-character */
4658 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4660 /* XXXX How to optimize if data == 0? */
4661 /* Optimize to a simpler form. */
4662 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4666 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4667 && (OP(nxt2) != WHILEM))
4669 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4670 /* Need to optimize away parenths. */
4671 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4672 /* Set the parenth number. */
4673 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4675 oscan->flags = (U8)ARG(nxt);
4676 if (RExC_open_parens) {
4677 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4678 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4680 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4681 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4684 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4685 OP(nxt + 1) = OPTIMIZED; /* was count. */
4686 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4687 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4690 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4691 regnode *nnxt = regnext(nxt1);
4693 if (reg_off_by_arg[OP(nxt1)])
4694 ARG_SET(nxt1, nxt2 - nxt1);
4695 else if (nxt2 - nxt1 < U16_MAX)
4696 NEXT_OFF(nxt1) = nxt2 - nxt1;
4698 OP(nxt) = NOTHING; /* Cannot beautify */
4703 /* Optimize again: */
4704 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4705 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4710 else if ((OP(oscan) == CURLYX)
4711 && (flags & SCF_WHILEM_VISITED_POS)
4712 /* See the comment on a similar expression above.
4713 However, this time it's not a subexpression
4714 we care about, but the expression itself. */
4715 && (maxcount == REG_INFTY)
4716 && data && ++data->whilem_c < 16) {
4717 /* This stays as CURLYX, we can put the count/of pair. */
4718 /* Find WHILEM (as in regexec.c) */
4719 regnode *nxt = oscan + NEXT_OFF(oscan);
4721 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4723 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4724 | (RExC_whilem_seen << 4)); /* On WHILEM */
4726 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4728 if (flags & SCF_DO_SUBSTR) {
4729 SV *last_str = NULL;
4730 STRLEN last_chrs = 0;
4731 int counted = mincount != 0;
4733 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4735 SSize_t b = pos_before >= data->last_start_min
4736 ? pos_before : data->last_start_min;
4738 const char * const s = SvPV_const(data->last_found, l);
4739 SSize_t old = b - data->last_start_min;
4742 old = utf8_hop((U8*)s, old) - (U8*)s;
4744 /* Get the added string: */
4745 last_str = newSVpvn_utf8(s + old, l, UTF);
4746 last_chrs = UTF ? utf8_length((U8*)(s + old),
4747 (U8*)(s + old + l)) : l;
4748 if (deltanext == 0 && pos_before == b) {
4749 /* What was added is a constant string */
4752 SvGROW(last_str, (mincount * l) + 1);
4753 repeatcpy(SvPVX(last_str) + l,
4754 SvPVX_const(last_str), l,
4756 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4757 /* Add additional parts. */
4758 SvCUR_set(data->last_found,
4759 SvCUR(data->last_found) - l);
4760 sv_catsv(data->last_found, last_str);
4762 SV * sv = data->last_found;
4764 SvUTF8(sv) && SvMAGICAL(sv) ?
4765 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4766 if (mg && mg->mg_len >= 0)
4767 mg->mg_len += last_chrs * (mincount-1);
4769 last_chrs *= mincount;
4770 data->last_end += l * (mincount - 1);
4773 /* start offset must point into the last copy */
4774 data->last_start_min += minnext * (mincount - 1);
4775 data->last_start_max += is_inf ? SSize_t_MAX
4776 : (maxcount - 1) * (minnext + data->pos_delta);
4779 /* It is counted once already... */
4780 data->pos_min += minnext * (mincount - counted);
4782 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4783 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4784 " maxcount=%"UVuf" mincount=%"UVuf"\n",
4785 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4787 if (deltanext != SSize_t_MAX)
4788 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4789 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4790 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4792 if (deltanext == SSize_t_MAX
4793 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4794 data->pos_delta = SSize_t_MAX;
4796 data->pos_delta += - counted * deltanext +
4797 (minnext + deltanext) * maxcount - minnext * mincount;
4798 if (mincount != maxcount) {
4799 /* Cannot extend fixed substrings found inside
4801 scan_commit(pRExC_state, data, minlenp, is_inf);
4802 if (mincount && last_str) {
4803 SV * const sv = data->last_found;
4804 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4805 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4809 sv_setsv(sv, last_str);
4810 data->last_end = data->pos_min;
4811 data->last_start_min = data->pos_min - last_chrs;
4812 data->last_start_max = is_inf
4814 : data->pos_min + data->pos_delta - last_chrs;
4816 data->longest = &(data->longest_float);
4818 SvREFCNT_dec(last_str);
4820 if (data && (fl & SF_HAS_EVAL))
4821 data->flags |= SF_HAS_EVAL;
4822 optimize_curly_tail:
4823 if (OP(oscan) != CURLYX) {
4824 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4826 NEXT_OFF(oscan) += NEXT_OFF(next);
4832 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4837 if (flags & SCF_DO_SUBSTR) {
4838 /* Cannot expect anything... */
4839 scan_commit(pRExC_state, data, minlenp, is_inf);
4840 data->longest = &(data->longest_float);
4842 is_inf = is_inf_internal = 1;
4843 if (flags & SCF_DO_STCLASS_OR) {
4844 if (OP(scan) == CLUMP) {
4845 /* Actually is any start char, but very few code points
4846 * aren't start characters */
4847 ssc_match_all_cp(data->start_class);
4850 ssc_anything(data->start_class);
4853 flags &= ~SCF_DO_STCLASS;
4857 else if (OP(scan) == LNBREAK) {
4858 if (flags & SCF_DO_STCLASS) {
4859 if (flags & SCF_DO_STCLASS_AND) {
4860 ssc_intersection(data->start_class,
4861 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4862 ssc_clear_locale(data->start_class);
4863 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4865 else if (flags & SCF_DO_STCLASS_OR) {
4866 ssc_union(data->start_class,
4867 PL_XPosix_ptrs[_CC_VERTSPACE],
4869 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4871 /* See commit msg for
4872 * 749e076fceedeb708a624933726e7989f2302f6a */
4873 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4875 flags &= ~SCF_DO_STCLASS;
4878 delta++; /* Because of the 2 char string cr-lf */
4879 if (flags & SCF_DO_SUBSTR) {
4880 /* Cannot expect anything... */
4881 scan_commit(pRExC_state, data, minlenp, is_inf);
4883 data->pos_delta += 1;
4884 data->longest = &(data->longest_float);
4887 else if (REGNODE_SIMPLE(OP(scan))) {
4889 if (flags & SCF_DO_SUBSTR) {
4890 scan_commit(pRExC_state, data, minlenp, is_inf);
4894 if (flags & SCF_DO_STCLASS) {
4896 SV* my_invlist = sv_2mortal(_new_invlist(0));
4899 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4900 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4902 /* Some of the logic below assumes that switching
4903 locale on will only add false positives. */
4908 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4913 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4914 ssc_match_all_cp(data->start_class);
4919 SV* REG_ANY_invlist = _new_invlist(2);
4920 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4922 if (flags & SCF_DO_STCLASS_OR) {
4923 ssc_union(data->start_class,
4925 TRUE /* TRUE => invert, hence all but \n
4929 else if (flags & SCF_DO_STCLASS_AND) {
4930 ssc_intersection(data->start_class,
4932 TRUE /* TRUE => invert */
4934 ssc_clear_locale(data->start_class);
4936 SvREFCNT_dec_NN(REG_ANY_invlist);
4941 if (flags & SCF_DO_STCLASS_AND)
4942 ssc_and(pRExC_state, data->start_class,
4943 (regnode_charclass *) scan);
4945 ssc_or(pRExC_state, data->start_class,
4946 (regnode_charclass *) scan);
4954 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4955 if (flags & SCF_DO_STCLASS_AND) {
4956 bool was_there = cBOOL(
4957 ANYOF_POSIXL_TEST(data->start_class,
4959 ANYOF_POSIXL_ZERO(data->start_class);
4960 if (was_there) { /* Do an AND */
4961 ANYOF_POSIXL_SET(data->start_class, namedclass);
4963 /* No individual code points can now match */
4964 data->start_class->invlist
4965 = sv_2mortal(_new_invlist(0));
4968 int complement = namedclass + ((invert) ? -1 : 1);
4970 assert(flags & SCF_DO_STCLASS_OR);
4972 /* If the complement of this class was already there,
4973 * the result is that they match all code points,
4974 * (\d + \D == everything). Remove the classes from
4975 * future consideration. Locale is not relevant in
4977 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4978 ssc_match_all_cp(data->start_class);
4979 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4980 ANYOF_POSIXL_CLEAR(data->start_class, complement);
4982 else { /* The usual case; just add this class to the
4984 ANYOF_POSIXL_SET(data->start_class, namedclass);
4989 case NPOSIXA: /* For these, we always know the exact set of
4994 if (FLAGS(scan) == _CC_ASCII) {
4995 my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4998 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
4999 PL_XPosix_ptrs[_CC_ASCII],
5010 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5012 /* NPOSIXD matches all upper Latin1 code points unless the
5013 * target string being matched is UTF-8, which is
5014 * unknowable until match time. Since we are going to
5015 * invert, we want to get rid of all of them so that the
5016 * inversion will match all */
5017 if (OP(scan) == NPOSIXD) {
5018 _invlist_subtract(my_invlist, PL_UpperLatin1,
5024 if (flags & SCF_DO_STCLASS_AND) {
5025 ssc_intersection(data->start_class, my_invlist, invert);
5026 ssc_clear_locale(data->start_class);
5029 assert(flags & SCF_DO_STCLASS_OR);
5030 ssc_union(data->start_class, my_invlist, invert);
5033 if (flags & SCF_DO_STCLASS_OR)
5034 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5035 flags &= ~SCF_DO_STCLASS;
5038 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5039 data->flags |= (OP(scan) == MEOL
5042 scan_commit(pRExC_state, data, minlenp, is_inf);
5045 else if ( PL_regkind[OP(scan)] == BRANCHJ
5046 /* Lookbehind, or need to calculate parens/evals/stclass: */
5047 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5048 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
5049 if ( OP(scan) == UNLESSM &&
5051 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5052 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5055 regnode *upto= regnext(scan);
5057 SV * const mysv_val=sv_newmortal();
5058 DEBUG_STUDYDATA("OPFAIL",data,depth);
5060 /*DEBUG_PARSE_MSG("opfail");*/
5061 regprop(RExC_rx, mysv_val, upto, NULL);
5062 PerlIO_printf(Perl_debug_log,
5063 "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5064 SvPV_nolen_const(mysv_val),
5065 (IV)REG_NODE_NUM(upto),
5070 NEXT_OFF(scan) = upto - scan;
5071 for (opt= scan + 1; opt < upto ; opt++)
5072 OP(opt) = OPTIMIZED;
5076 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5077 || OP(scan) == UNLESSM )
5079 /* Negative Lookahead/lookbehind
5080 In this case we can't do fixed string optimisation.
5083 SSize_t deltanext, minnext, fake = 0;
5088 data_fake.flags = 0;
5090 data_fake.whilem_c = data->whilem_c;
5091 data_fake.last_closep = data->last_closep;
5094 data_fake.last_closep = &fake;
5095 data_fake.pos_delta = delta;
5096 if ( flags & SCF_DO_STCLASS && !scan->flags
5097 && OP(scan) == IFMATCH ) { /* Lookahead */
5098 ssc_init(pRExC_state, &intrnl);
5099 data_fake.start_class = &intrnl;
5100 f |= SCF_DO_STCLASS_AND;
5102 if (flags & SCF_WHILEM_VISITED_POS)
5103 f |= SCF_WHILEM_VISITED_POS;
5104 next = regnext(scan);
5105 nscan = NEXTOPER(NEXTOPER(scan));
5106 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5107 last, &data_fake, stopparen,
5108 recursed_depth, NULL, f, depth+1);
5111 FAIL("Variable length lookbehind not implemented");
5113 else if (minnext > (I32)U8_MAX) {
5114 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5117 scan->flags = (U8)minnext;
5120 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5122 if (data_fake.flags & SF_HAS_EVAL)
5123 data->flags |= SF_HAS_EVAL;
5124 data->whilem_c = data_fake.whilem_c;
5126 if (f & SCF_DO_STCLASS_AND) {
5127 if (flags & SCF_DO_STCLASS_OR) {
5128 /* OR before, AND after: ideally we would recurse with
5129 * data_fake to get the AND applied by study of the
5130 * remainder of the pattern, and then derecurse;
5131 * *** HACK *** for now just treat as "no information".
5132 * See [perl #56690].
5134 ssc_init(pRExC_state, data->start_class);
5136 /* AND before and after: combine and continue */
5137 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5141 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5143 /* Positive Lookahead/lookbehind
5144 In this case we can do fixed string optimisation,
5145 but we must be careful about it. Note in the case of
5146 lookbehind the positions will be offset by the minimum
5147 length of the pattern, something we won't know about
5148 until after the recurse.
5150 SSize_t deltanext, fake = 0;
5154 /* We use SAVEFREEPV so that when the full compile
5155 is finished perl will clean up the allocated
5156 minlens when it's all done. This way we don't
5157 have to worry about freeing them when we know
5158 they wont be used, which would be a pain.
5161 Newx( minnextp, 1, SSize_t );
5162 SAVEFREEPV(minnextp);
5165 StructCopy(data, &data_fake, scan_data_t);
5166 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5169 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5170 data_fake.last_found=newSVsv(data->last_found);
5174 data_fake.last_closep = &fake;
5175 data_fake.flags = 0;
5176 data_fake.pos_delta = delta;
5178 data_fake.flags |= SF_IS_INF;
5179 if ( flags & SCF_DO_STCLASS && !scan->flags
5180 && OP(scan) == IFMATCH ) { /* Lookahead */
5181 ssc_init(pRExC_state, &intrnl);
5182 data_fake.start_class = &intrnl;
5183 f |= SCF_DO_STCLASS_AND;
5185 if (flags & SCF_WHILEM_VISITED_POS)
5186 f |= SCF_WHILEM_VISITED_POS;
5187 next = regnext(scan);
5188 nscan = NEXTOPER(NEXTOPER(scan));
5190 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5191 &deltanext, last, &data_fake,
5192 stopparen, recursed_depth, NULL,
5196 FAIL("Variable length lookbehind not implemented");
5198 else if (*minnextp > (I32)U8_MAX) {
5199 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5202 scan->flags = (U8)*minnextp;
5207 if (f & SCF_DO_STCLASS_AND) {
5208 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5211 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5213 if (data_fake.flags & SF_HAS_EVAL)
5214 data->flags |= SF_HAS_EVAL;
5215 data->whilem_c = data_fake.whilem_c;
5216 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5217 if (RExC_rx->minlen<*minnextp)
5218 RExC_rx->minlen=*minnextp;
5219 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5220 SvREFCNT_dec_NN(data_fake.last_found);
5222 if ( data_fake.minlen_fixed != minlenp )
5224 data->offset_fixed= data_fake.offset_fixed;
5225 data->minlen_fixed= data_fake.minlen_fixed;
5226 data->lookbehind_fixed+= scan->flags;
5228 if ( data_fake.minlen_float != minlenp )
5230 data->minlen_float= data_fake.minlen_float;
5231 data->offset_float_min=data_fake.offset_float_min;
5232 data->offset_float_max=data_fake.offset_float_max;
5233 data->lookbehind_float+= scan->flags;
5240 else if (OP(scan) == OPEN) {
5241 if (stopparen != (I32)ARG(scan))
5244 else if (OP(scan) == CLOSE) {
5245 if (stopparen == (I32)ARG(scan)) {
5248 if ((I32)ARG(scan) == is_par) {
5249 next = regnext(scan);
5251 if ( next && (OP(next) != WHILEM) && next < last)
5252 is_par = 0; /* Disable optimization */
5255 *(data->last_closep) = ARG(scan);
5257 else if (OP(scan) == EVAL) {
5259 data->flags |= SF_HAS_EVAL;
5261 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5262 if (flags & SCF_DO_SUBSTR) {
5263 scan_commit(pRExC_state, data, minlenp, is_inf);
5264 flags &= ~SCF_DO_SUBSTR;
5266 if (data && OP(scan)==ACCEPT) {
5267 data->flags |= SCF_SEEN_ACCEPT;
5272 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5274 if (flags & SCF_DO_SUBSTR) {
5275 scan_commit(pRExC_state, data, minlenp, is_inf);
5276 data->longest = &(data->longest_float);
5278 is_inf = is_inf_internal = 1;
5279 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5280 ssc_anything(data->start_class);
5281 flags &= ~SCF_DO_STCLASS;
5283 else if (OP(scan) == GPOS) {
5284 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5285 !(delta || is_inf || (data && data->pos_delta)))
5287 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5288 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5289 if (RExC_rx->gofs < (STRLEN)min)
5290 RExC_rx->gofs = min;
5292 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5296 #ifdef TRIE_STUDY_OPT
5297 #ifdef FULL_TRIE_STUDY
5298 else if (PL_regkind[OP(scan)] == TRIE) {
5299 /* NOTE - There is similar code to this block above for handling
5300 BRANCH nodes on the initial study. If you change stuff here
5302 regnode *trie_node= scan;
5303 regnode *tail= regnext(scan);
5304 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5305 SSize_t max1 = 0, min1 = SSize_t_MAX;
5308 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5309 /* Cannot merge strings after this. */
5310 scan_commit(pRExC_state, data, minlenp, is_inf);
5312 if (flags & SCF_DO_STCLASS)
5313 ssc_init_zero(pRExC_state, &accum);
5319 const regnode *nextbranch= NULL;
5322 for ( word=1 ; word <= trie->wordcount ; word++)
5324 SSize_t deltanext=0, minnext=0, f = 0, fake;
5325 regnode_ssc this_class;
5327 data_fake.flags = 0;
5329 data_fake.whilem_c = data->whilem_c;
5330 data_fake.last_closep = data->last_closep;
5333 data_fake.last_closep = &fake;
5334 data_fake.pos_delta = delta;
5335 if (flags & SCF_DO_STCLASS) {
5336 ssc_init(pRExC_state, &this_class);
5337 data_fake.start_class = &this_class;
5338 f = SCF_DO_STCLASS_AND;
5340 if (flags & SCF_WHILEM_VISITED_POS)
5341 f |= SCF_WHILEM_VISITED_POS;
5343 if (trie->jump[word]) {
5345 nextbranch = trie_node + trie->jump[0];
5346 scan= trie_node + trie->jump[word];
5347 /* We go from the jump point to the branch that follows
5348 it. Note this means we need the vestigal unused
5349 branches even though they arent otherwise used. */
5350 minnext = study_chunk(pRExC_state, &scan, minlenp,
5351 &deltanext, (regnode *)nextbranch, &data_fake,
5352 stopparen, recursed_depth, NULL, f,depth+1);
5354 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5355 nextbranch= regnext((regnode*)nextbranch);
5357 if (min1 > (SSize_t)(minnext + trie->minlen))
5358 min1 = minnext + trie->minlen;
5359 if (deltanext == SSize_t_MAX) {
5360 is_inf = is_inf_internal = 1;
5362 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5363 max1 = minnext + deltanext + trie->maxlen;
5365 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5367 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5368 if ( stopmin > min + min1)
5369 stopmin = min + min1;
5370 flags &= ~SCF_DO_SUBSTR;
5372 data->flags |= SCF_SEEN_ACCEPT;
5375 if (data_fake.flags & SF_HAS_EVAL)
5376 data->flags |= SF_HAS_EVAL;
5377 data->whilem_c = data_fake.whilem_c;
5379 if (flags & SCF_DO_STCLASS)
5380 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5383 if (flags & SCF_DO_SUBSTR) {
5384 data->pos_min += min1;
5385 data->pos_delta += max1 - min1;
5386 if (max1 != min1 || is_inf)
5387 data->longest = &(data->longest_float);
5390 delta += max1 - min1;
5391 if (flags & SCF_DO_STCLASS_OR) {
5392 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5394 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5395 flags &= ~SCF_DO_STCLASS;
5398 else if (flags & SCF_DO_STCLASS_AND) {
5400 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5401 flags &= ~SCF_DO_STCLASS;
5404 /* Switch to OR mode: cache the old value of
5405 * data->start_class */
5407 StructCopy(data->start_class, and_withp, regnode_ssc);
5408 flags &= ~SCF_DO_STCLASS_AND;
5409 StructCopy(&accum, data->start_class, regnode_ssc);
5410 flags |= SCF_DO_STCLASS_OR;
5417 else if (PL_regkind[OP(scan)] == TRIE) {
5418 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5421 min += trie->minlen;
5422 delta += (trie->maxlen - trie->minlen);
5423 flags &= ~SCF_DO_STCLASS; /* xxx */
5424 if (flags & SCF_DO_SUBSTR) {
5425 /* Cannot expect anything... */
5426 scan_commit(pRExC_state, data, minlenp, is_inf);
5427 data->pos_min += trie->minlen;
5428 data->pos_delta += (trie->maxlen - trie->minlen);
5429 if (trie->maxlen != trie->minlen)
5430 data->longest = &(data->longest_float);
5432 if (trie->jump) /* no more substrings -- for now /grr*/
5433 flags &= ~SCF_DO_SUBSTR;
5435 #endif /* old or new */
5436 #endif /* TRIE_STUDY_OPT */
5438 /* Else: zero-length, ignore. */
5439 scan = regnext(scan);
5441 /* If we are exiting a recursion we can unset its recursed bit
5442 * and allow ourselves to enter it again - no danger of an
5443 * infinite loop there.
5444 if (stopparen > -1 && recursed) {
5445 DEBUG_STUDYDATA("unset:", data,depth);
5446 PAREN_UNSET( recursed, stopparen);
5450 DEBUG_STUDYDATA("frame-end:",data,depth);
5451 DEBUG_PEEP("fend", scan, depth);
5452 /* restore previous context */
5455 stopparen = frame->stop;
5456 recursed_depth = frame->prev_recursed_depth;
5459 frame = frame->prev;
5460 goto fake_study_recurse;
5465 DEBUG_STUDYDATA("pre-fin:",data,depth);
5468 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5470 if (flags & SCF_DO_SUBSTR && is_inf)
5471 data->pos_delta = SSize_t_MAX - data->pos_min;
5472 if (is_par > (I32)U8_MAX)
5474 if (is_par && pars==1 && data) {
5475 data->flags |= SF_IN_PAR;
5476 data->flags &= ~SF_HAS_PAR;
5478 else if (pars && data) {
5479 data->flags |= SF_HAS_PAR;
5480 data->flags &= ~SF_IN_PAR;
5482 if (flags & SCF_DO_STCLASS_OR)
5483 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5484 if (flags & SCF_TRIE_RESTUDY)
5485 data->flags |= SCF_TRIE_RESTUDY;
5487 DEBUG_STUDYDATA("post-fin:",data,depth);
5490 SSize_t final_minlen= min < stopmin ? min : stopmin;
5492 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5493 RExC_maxlen = final_minlen + delta;
5495 return final_minlen;
5501 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5503 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5505 PERL_ARGS_ASSERT_ADD_DATA;
5507 Renewc(RExC_rxi->data,
5508 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5509 char, struct reg_data);
5511 Renew(RExC_rxi->data->what, count + n, U8);
5513 Newx(RExC_rxi->data->what, n, U8);
5514 RExC_rxi->data->count = count + n;
5515 Copy(s, RExC_rxi->data->what + count, n, U8);
5519 /*XXX: todo make this not included in a non debugging perl, but appears to be
5520 * used anyway there, in 'use re' */
5521 #ifndef PERL_IN_XSUB_RE
5523 Perl_reginitcolors(pTHX)
5526 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5528 char *t = savepv(s);
5532 t = strchr(t, '\t');
5538 PL_colors[i] = t = (char *)"";
5543 PL_colors[i++] = (char *)"";
5550 #ifdef TRIE_STUDY_OPT
5551 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5554 (data.flags & SCF_TRIE_RESTUDY) \
5562 #define CHECK_RESTUDY_GOTO_butfirst
5566 * pregcomp - compile a regular expression into internal code
5568 * Decides which engine's compiler to call based on the hint currently in
5572 #ifndef PERL_IN_XSUB_RE
5574 /* return the currently in-scope regex engine (or the default if none) */
5576 regexp_engine const *
5577 Perl_current_re_engine(pTHX)
5581 if (IN_PERL_COMPILETIME) {
5582 HV * const table = GvHV(PL_hintgv);
5585 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5586 return &PL_core_reg_engine;
5587 ptr = hv_fetchs(table, "regcomp", FALSE);
5588 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5589 return &PL_core_reg_engine;
5590 return INT2PTR(regexp_engine*,SvIV(*ptr));
5594 if (!PL_curcop->cop_hints_hash)
5595 return &PL_core_reg_engine;
5596 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5597 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5598 return &PL_core_reg_engine;
5599 return INT2PTR(regexp_engine*,SvIV(ptr));
5605 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5608 regexp_engine const *eng = current_re_engine();
5609 GET_RE_DEBUG_FLAGS_DECL;
5611 PERL_ARGS_ASSERT_PREGCOMP;
5613 /* Dispatch a request to compile a regexp to correct regexp engine. */
5615 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5618 return CALLREGCOMP_ENG(eng, pattern, flags);
5622 /* public(ish) entry point for the perl core's own regex compiling code.
5623 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5624 * pattern rather than a list of OPs, and uses the internal engine rather
5625 * than the current one */
5628 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5630 SV *pat = pattern; /* defeat constness! */
5631 PERL_ARGS_ASSERT_RE_COMPILE;
5632 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5633 #ifdef PERL_IN_XSUB_RE
5636 &PL_core_reg_engine,
5638 NULL, NULL, rx_flags, 0);
5642 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5643 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5644 * point to the realloced string and length.
5646 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5650 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5651 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5653 U8 *const src = (U8*)*pat_p;
5656 STRLEN s = 0, d = 0;
5658 GET_RE_DEBUG_FLAGS_DECL;
5660 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5661 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5663 Newx(dst, *plen_p * 2 + 1, U8);
5665 while (s < *plen_p) {
5666 if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5669 dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5670 dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
5672 if (n < num_code_blocks) {
5673 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5674 pRExC_state->code_blocks[n].start = d;
5675 assert(dst[d] == '(');
5678 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5679 pRExC_state->code_blocks[n].end = d;
5680 assert(dst[d] == ')');
5690 *pat_p = (char*) dst;
5692 RExC_orig_utf8 = RExC_utf8 = 1;
5697 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5698 * while recording any code block indices, and handling overloading,
5699 * nested qr// objects etc. If pat is null, it will allocate a new
5700 * string, or just return the first arg, if there's only one.
5702 * Returns the malloced/updated pat.
5703 * patternp and pat_count is the array of SVs to be concatted;
5704 * oplist is the optional list of ops that generated the SVs;
5705 * recompile_p is a pointer to a boolean that will be set if
5706 * the regex will need to be recompiled.
5707 * delim, if non-null is an SV that will be inserted between each element
5711 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5712 SV *pat, SV ** const patternp, int pat_count,
5713 OP *oplist, bool *recompile_p, SV *delim)
5717 bool use_delim = FALSE;
5718 bool alloced = FALSE;
5720 /* if we know we have at least two args, create an empty string,
5721 * then concatenate args to that. For no args, return an empty string */
5722 if (!pat && pat_count != 1) {
5728 for (svp = patternp; svp < patternp + pat_count; svp++) {
5731 STRLEN orig_patlen = 0;
5733 SV *msv = use_delim ? delim : *svp;
5734 if (!msv) msv = &PL_sv_undef;
5736 /* if we've got a delimiter, we go round the loop twice for each
5737 * svp slot (except the last), using the delimiter the second
5746 if (SvTYPE(msv) == SVt_PVAV) {
5747 /* we've encountered an interpolated array within
5748 * the pattern, e.g. /...@a..../. Expand the list of elements,
5749 * then recursively append elements.
5750 * The code in this block is based on S_pushav() */
5752 AV *const av = (AV*)msv;
5753 const SSize_t maxarg = AvFILL(av) + 1;
5757 assert(oplist->op_type == OP_PADAV
5758 || oplist->op_type == OP_RV2AV);
5759 oplist = oplist->op_sibling;;
5762 if (SvRMAGICAL(av)) {
5765 Newx(array, maxarg, SV*);
5767 for (i=0; i < maxarg; i++) {
5768 SV ** const svp = av_fetch(av, i, FALSE);
5769 array[i] = svp ? *svp : &PL_sv_undef;
5773 array = AvARRAY(av);
5775 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5776 array, maxarg, NULL, recompile_p,
5778 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5784 /* we make the assumption here that each op in the list of
5785 * op_siblings maps to one SV pushed onto the stack,
5786 * except for code blocks, with have both an OP_NULL and
5788 * This allows us to match up the list of SVs against the
5789 * list of OPs to find the next code block.
5791 * Note that PUSHMARK PADSV PADSV ..
5793 * PADRANGE PADSV PADSV ..
5794 * so the alignment still works. */
5797 if (oplist->op_type == OP_NULL
5798 && (oplist->op_flags & OPf_SPECIAL))
5800 assert(n < pRExC_state->num_code_blocks);
5801 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5802 pRExC_state->code_blocks[n].block = oplist;
5803 pRExC_state->code_blocks[n].src_regex = NULL;
5806 oplist = oplist->op_sibling; /* skip CONST */
5809 oplist = oplist->op_sibling;;
5812 /* apply magic and QR overloading to arg */
5815 if (SvROK(msv) && SvAMAGIC(msv)) {
5816 SV *sv = AMG_CALLunary(msv, regexp_amg);
5820 if (SvTYPE(sv) != SVt_REGEXP)
5821 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5826 /* try concatenation overload ... */
5827 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5828 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5831 /* overloading involved: all bets are off over literal
5832 * code. Pretend we haven't seen it */
5833 pRExC_state->num_code_blocks -= n;
5837 /* ... or failing that, try "" overload */
5838 while (SvAMAGIC(msv)
5839 && (sv = AMG_CALLunary(msv, string_amg))
5843 && SvRV(msv) == SvRV(sv))
5848 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5852 /* this is a partially unrolled
5853 * sv_catsv_nomg(pat, msv);
5854 * that allows us to adjust code block indices if
5857 char *dst = SvPV_force_nomg(pat, dlen);
5859 if (SvUTF8(msv) && !SvUTF8(pat)) {
5860 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5861 sv_setpvn(pat, dst, dlen);
5864 sv_catsv_nomg(pat, msv);
5871 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5874 /* extract any code blocks within any embedded qr//'s */
5875 if (rx && SvTYPE(rx) == SVt_REGEXP
5876 && RX_ENGINE((REGEXP*)rx)->op_comp)
5879 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5880 if (ri->num_code_blocks) {
5882 /* the presence of an embedded qr// with code means
5883 * we should always recompile: the text of the
5884 * qr// may not have changed, but it may be a
5885 * different closure than last time */
5887 Renew(pRExC_state->code_blocks,
5888 pRExC_state->num_code_blocks + ri->num_code_blocks,
5889 struct reg_code_block);
5890 pRExC_state->num_code_blocks += ri->num_code_blocks;
5892 for (i=0; i < ri->num_code_blocks; i++) {
5893 struct reg_code_block *src, *dst;
5894 STRLEN offset = orig_patlen
5895 + ReANY((REGEXP *)rx)->pre_prefix;
5896 assert(n < pRExC_state->num_code_blocks);
5897 src = &ri->code_blocks[i];
5898 dst = &pRExC_state->code_blocks[n];
5899 dst->start = src->start + offset;
5900 dst->end = src->end + offset;
5901 dst->block = src->block;
5902 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5911 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5920 /* see if there are any run-time code blocks in the pattern.
5921 * False positives are allowed */
5924 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5925 char *pat, STRLEN plen)
5930 PERL_UNUSED_CONTEXT;
5932 for (s = 0; s < plen; s++) {
5933 if (n < pRExC_state->num_code_blocks
5934 && s == pRExC_state->code_blocks[n].start)
5936 s = pRExC_state->code_blocks[n].end;
5940 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5942 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5944 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5951 /* Handle run-time code blocks. We will already have compiled any direct
5952 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5953 * copy of it, but with any literal code blocks blanked out and
5954 * appropriate chars escaped; then feed it into
5956 * eval "qr'modified_pattern'"
5960 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5964 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5966 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5967 * and merge them with any code blocks of the original regexp.
5969 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5970 * instead, just save the qr and return FALSE; this tells our caller that
5971 * the original pattern needs upgrading to utf8.
5975 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5976 char *pat, STRLEN plen)
5980 GET_RE_DEBUG_FLAGS_DECL;
5982 if (pRExC_state->runtime_code_qr) {
5983 /* this is the second time we've been called; this should
5984 * only happen if the main pattern got upgraded to utf8
5985 * during compilation; re-use the qr we compiled first time
5986 * round (which should be utf8 too)
5988 qr = pRExC_state->runtime_code_qr;
5989 pRExC_state->runtime_code_qr = NULL;
5990 assert(RExC_utf8 && SvUTF8(qr));
5996 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6000 /* determine how many extra chars we need for ' and \ escaping */
6001 for (s = 0; s < plen; s++) {
6002 if (pat[s] == '\'' || pat[s] == '\\')
6006 Newx(newpat, newlen, char);
6008 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6010 for (s = 0; s < plen; s++) {
6011 if (n < pRExC_state->num_code_blocks
6012 && s == pRExC_state->code_blocks[n].start)
6014 /* blank out literal code block */
6015 assert(pat[s] == '(');
6016 while (s <= pRExC_state->code_blocks[n].end) {
6024 if (pat[s] == '\'' || pat[s] == '\\')
6029 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6033 PerlIO_printf(Perl_debug_log,
6034 "%sre-parsing pattern for runtime code:%s %s\n",
6035 PL_colors[4],PL_colors[5],newpat);
6038 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6044 PUSHSTACKi(PERLSI_REQUIRE);
6045 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6046 * parsing qr''; normally only q'' does this. It also alters
6048 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6049 SvREFCNT_dec_NN(sv);
6054 SV * const errsv = ERRSV;
6055 if (SvTRUE_NN(errsv))
6057 Safefree(pRExC_state->code_blocks);
6058 /* use croak_sv ? */
6059 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6062 assert(SvROK(qr_ref));
6064 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6065 /* the leaving below frees the tmp qr_ref.
6066 * Give qr a life of its own */
6074 if (!RExC_utf8 && SvUTF8(qr)) {
6075 /* first time through; the pattern got upgraded; save the
6076 * qr for the next time through */
6077 assert(!pRExC_state->runtime_code_qr);
6078 pRExC_state->runtime_code_qr = qr;
6083 /* extract any code blocks within the returned qr// */
6086 /* merge the main (r1) and run-time (r2) code blocks into one */
6088 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6089 struct reg_code_block *new_block, *dst;
6090 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6093 if (!r2->num_code_blocks) /* we guessed wrong */
6095 SvREFCNT_dec_NN(qr);
6100 r1->num_code_blocks + r2->num_code_blocks,
6101 struct reg_code_block);
6104 while ( i1 < r1->num_code_blocks
6105 || i2 < r2->num_code_blocks)
6107 struct reg_code_block *src;
6110 if (i1 == r1->num_code_blocks) {
6111 src = &r2->code_blocks[i2++];
6114 else if (i2 == r2->num_code_blocks)
6115 src = &r1->code_blocks[i1++];
6116 else if ( r1->code_blocks[i1].start
6117 < r2->code_blocks[i2].start)
6119 src = &r1->code_blocks[i1++];
6120 assert(src->end < r2->code_blocks[i2].start);
6123 assert( r1->code_blocks[i1].start
6124 > r2->code_blocks[i2].start);
6125 src = &r2->code_blocks[i2++];
6127 assert(src->end < r1->code_blocks[i1].start);
6130 assert(pat[src->start] == '(');
6131 assert(pat[src->end] == ')');
6132 dst->start = src->start;
6133 dst->end = src->end;
6134 dst->block = src->block;
6135 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6139 r1->num_code_blocks += r2->num_code_blocks;
6140 Safefree(r1->code_blocks);
6141 r1->code_blocks = new_block;
6144 SvREFCNT_dec_NN(qr);
6150 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6151 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6152 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6153 STRLEN longest_length, bool eol, bool meol)
6155 /* This is the common code for setting up the floating and fixed length
6156 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6157 * as to whether succeeded or not */
6162 if (! (longest_length
6163 || (eol /* Can't have SEOL and MULTI */
6164 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6166 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6167 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6172 /* copy the information about the longest from the reg_scan_data
6173 over to the program. */
6174 if (SvUTF8(sv_longest)) {
6175 *rx_utf8 = sv_longest;
6178 *rx_substr = sv_longest;
6181 /* end_shift is how many chars that must be matched that
6182 follow this item. We calculate it ahead of time as once the
6183 lookbehind offset is added in we lose the ability to correctly
6185 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6186 *rx_end_shift = ml - offset
6187 - longest_length + (SvTAIL(sv_longest) != 0)
6190 t = (eol/* Can't have SEOL and MULTI */
6191 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6192 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6198 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6199 * regular expression into internal code.
6200 * The pattern may be passed either as:
6201 * a list of SVs (patternp plus pat_count)
6202 * a list of OPs (expr)
6203 * If both are passed, the SV list is used, but the OP list indicates
6204 * which SVs are actually pre-compiled code blocks
6206 * The SVs in the list have magic and qr overloading applied to them (and
6207 * the list may be modified in-place with replacement SVs in the latter
6210 * If the pattern hasn't changed from old_re, then old_re will be
6213 * eng is the current engine. If that engine has an op_comp method, then
6214 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6215 * do the initial concatenation of arguments and pass on to the external
6218 * If is_bare_re is not null, set it to a boolean indicating whether the
6219 * arg list reduced (after overloading) to a single bare regex which has
6220 * been returned (i.e. /$qr/).
6222 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6224 * pm_flags contains the PMf_* flags, typically based on those from the
6225 * pm_flags field of the related PMOP. Currently we're only interested in
6226 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6228 * We can't allocate space until we know how big the compiled form will be,
6229 * but we can't compile it (and thus know how big it is) until we've got a
6230 * place to put the code. So we cheat: we compile it twice, once with code
6231 * generation turned off and size counting turned on, and once "for real".
6232 * This also means that we don't allocate space until we are sure that the
6233 * thing really will compile successfully, and we never have to move the
6234 * code and thus invalidate pointers into it. (Note that it has to be in
6235 * one piece because free() must be able to free it all.) [NB: not true in perl]
6237 * Beware that the optimization-preparation code in here knows about some
6238 * of the structure of the compiled regexp. [I'll say.]
6242 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6243 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6244 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6249 regexp_internal *ri;
6257 SV *code_blocksv = NULL;
6258 SV** new_patternp = patternp;
6260 /* these are all flags - maybe they should be turned
6261 * into a single int with different bit masks */
6262 I32 sawlookahead = 0;
6267 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6269 bool runtime_code = 0;
6271 RExC_state_t RExC_state;
6272 RExC_state_t * const pRExC_state = &RExC_state;
6273 #ifdef TRIE_STUDY_OPT
6275 RExC_state_t copyRExC_state;
6277 GET_RE_DEBUG_FLAGS_DECL;
6279 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6281 DEBUG_r(if (!PL_colorset) reginitcolors());
6283 #ifndef PERL_IN_XSUB_RE
6284 /* Initialize these here instead of as-needed, as is quick and avoids
6285 * having to test them each time otherwise */
6286 if (! PL_AboveLatin1) {
6287 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6288 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6289 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6290 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6291 PL_HasMultiCharFold =
6292 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6296 pRExC_state->code_blocks = NULL;
6297 pRExC_state->num_code_blocks = 0;
6300 *is_bare_re = FALSE;
6302 if (expr && (expr->op_type == OP_LIST ||
6303 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6304 /* allocate code_blocks if needed */
6308 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6309 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6310 ncode++; /* count of DO blocks */
6312 pRExC_state->num_code_blocks = ncode;
6313 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6318 /* compile-time pattern with just OP_CONSTs and DO blocks */
6323 /* find how many CONSTs there are */
6326 if (expr->op_type == OP_CONST)
6329 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6330 if (o->op_type == OP_CONST)
6334 /* fake up an SV array */
6336 assert(!new_patternp);
6337 Newx(new_patternp, n, SV*);
6338 SAVEFREEPV(new_patternp);
6342 if (expr->op_type == OP_CONST)
6343 new_patternp[n] = cSVOPx_sv(expr);
6345 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6346 if (o->op_type == OP_CONST)
6347 new_patternp[n++] = cSVOPo_sv;
6352 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6353 "Assembling pattern from %d elements%s\n", pat_count,
6354 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6356 /* set expr to the first arg op */
6358 if (pRExC_state->num_code_blocks
6359 && expr->op_type != OP_CONST)
6361 expr = cLISTOPx(expr)->op_first;
6362 assert( expr->op_type == OP_PUSHMARK
6363 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6364 || expr->op_type == OP_PADRANGE);
6365 expr = expr->op_sibling;
6368 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6369 expr, &recompile, NULL);
6371 /* handle bare (possibly after overloading) regex: foo =~ $re */
6376 if (SvTYPE(re) == SVt_REGEXP) {
6380 Safefree(pRExC_state->code_blocks);
6381 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6382 "Precompiled pattern%s\n",
6383 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6389 exp = SvPV_nomg(pat, plen);
6391 if (!eng->op_comp) {
6392 if ((SvUTF8(pat) && IN_BYTES)
6393 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6395 /* make a temporary copy; either to convert to bytes,
6396 * or to avoid repeating get-magic / overloaded stringify */
6397 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6398 (IN_BYTES ? 0 : SvUTF8(pat)));
6400 Safefree(pRExC_state->code_blocks);
6401 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6404 /* ignore the utf8ness if the pattern is 0 length */
6405 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6406 RExC_uni_semantics = 0;
6407 RExC_contains_locale = 0;
6408 RExC_contains_i = 0;
6409 pRExC_state->runtime_code_qr = NULL;
6412 SV *dsv= sv_newmortal();
6413 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6414 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6415 PL_colors[4],PL_colors[5],s);
6419 /* we jump here if we upgrade the pattern to utf8 and have to
6422 if ((pm_flags & PMf_USE_RE_EVAL)
6423 /* this second condition covers the non-regex literal case,
6424 * i.e. $foo =~ '(?{})'. */
6425 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6427 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6429 /* return old regex if pattern hasn't changed */
6430 /* XXX: note in the below we have to check the flags as well as the
6433 * Things get a touch tricky as we have to compare the utf8 flag
6434 * independently from the compile flags. */
6438 && !!RX_UTF8(old_re) == !!RExC_utf8
6439 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6440 && RX_PRECOMP(old_re)
6441 && RX_PRELEN(old_re) == plen
6442 && memEQ(RX_PRECOMP(old_re), exp, plen)
6443 && !runtime_code /* with runtime code, always recompile */ )
6445 Safefree(pRExC_state->code_blocks);
6449 rx_flags = orig_rx_flags;
6451 if (rx_flags & PMf_FOLD) {
6452 RExC_contains_i = 1;
6454 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6456 /* Set to use unicode semantics if the pattern is in utf8 and has the
6457 * 'depends' charset specified, as it means unicode when utf8 */
6458 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6462 RExC_flags = rx_flags;
6463 RExC_pm_flags = pm_flags;
6466 if (TAINTING_get && TAINT_get)
6467 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6469 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6470 /* whoops, we have a non-utf8 pattern, whilst run-time code
6471 * got compiled as utf8. Try again with a utf8 pattern */
6472 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6473 pRExC_state->num_code_blocks);
6474 goto redo_first_pass;
6477 assert(!pRExC_state->runtime_code_qr);
6483 RExC_in_lookbehind = 0;
6484 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6486 RExC_override_recoding = 0;
6487 RExC_in_multi_char_class = 0;
6489 /* First pass: determine size, legality. */
6492 RExC_end = exp + plen;
6497 RExC_emit = (regnode *) &RExC_emit_dummy;
6498 RExC_whilem_seen = 0;
6499 RExC_open_parens = NULL;
6500 RExC_close_parens = NULL;
6502 RExC_paren_names = NULL;
6504 RExC_paren_name_list = NULL;
6506 RExC_recurse = NULL;
6507 RExC_study_chunk_recursed = NULL;
6508 RExC_study_chunk_recursed_bytes= 0;
6509 RExC_recurse_count = 0;
6510 pRExC_state->code_index = 0;
6512 #if 0 /* REGC() is (currently) a NOP at the first pass.
6513 * Clever compilers notice this and complain. --jhi */
6514 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6517 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6519 RExC_lastparse=NULL;
6521 /* reg may croak on us, not giving us a chance to free
6522 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6523 need it to survive as long as the regexp (qr/(?{})/).
6524 We must check that code_blocksv is not already set, because we may
6525 have jumped back to restart the sizing pass. */
6526 if (pRExC_state->code_blocks && !code_blocksv) {
6527 code_blocksv = newSV_type(SVt_PV);
6528 SAVEFREESV(code_blocksv);
6529 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6530 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6532 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6533 /* It's possible to write a regexp in ascii that represents Unicode
6534 codepoints outside of the byte range, such as via \x{100}. If we
6535 detect such a sequence we have to convert the entire pattern to utf8
6536 and then recompile, as our sizing calculation will have been based
6537 on 1 byte == 1 character, but we will need to use utf8 to encode
6538 at least some part of the pattern, and therefore must convert the whole
6541 if (flags & RESTART_UTF8) {
6542 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6543 pRExC_state->num_code_blocks);
6544 goto redo_first_pass;
6546 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6549 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6552 PerlIO_printf(Perl_debug_log,
6553 "Required size %"IVdf" nodes\n"
6554 "Starting second pass (creation)\n",
6557 RExC_lastparse=NULL;
6560 /* The first pass could have found things that force Unicode semantics */
6561 if ((RExC_utf8 || RExC_uni_semantics)
6562 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6564 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6567 /* Small enough for pointer-storage convention?
6568 If extralen==0, this means that we will not need long jumps. */
6569 if (RExC_size >= 0x10000L && RExC_extralen)
6570 RExC_size += RExC_extralen;
6573 if (RExC_whilem_seen > 15)
6574 RExC_whilem_seen = 15;
6576 /* Allocate space and zero-initialize. Note, the two step process
6577 of zeroing when in debug mode, thus anything assigned has to
6578 happen after that */
6579 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6581 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6582 char, regexp_internal);
6583 if ( r == NULL || ri == NULL )
6584 FAIL("Regexp out of space");
6586 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6587 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6590 /* bulk initialize base fields with 0. */
6591 Zero(ri, sizeof(regexp_internal), char);
6594 /* non-zero initialization begins here */
6597 r->extflags = rx_flags;
6598 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6600 if (pm_flags & PMf_IS_QR) {
6601 ri->code_blocks = pRExC_state->code_blocks;
6602 ri->num_code_blocks = pRExC_state->num_code_blocks;
6607 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6608 if (pRExC_state->code_blocks[n].src_regex)
6609 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6610 SAVEFREEPV(pRExC_state->code_blocks);
6614 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6615 bool has_charset = (get_regex_charset(r->extflags)
6616 != REGEX_DEPENDS_CHARSET);
6618 /* The caret is output if there are any defaults: if not all the STD
6619 * flags are set, or if no character set specifier is needed */
6621 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6623 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6624 == REG_RUN_ON_COMMENT_SEEN);
6625 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6626 >> RXf_PMf_STD_PMMOD_SHIFT);
6627 const char *fptr = STD_PAT_MODS; /*"msix"*/
6629 /* Allocate for the worst case, which is all the std flags are turned
6630 * on. If more precision is desired, we could do a population count of
6631 * the flags set. This could be done with a small lookup table, or by
6632 * shifting, masking and adding, or even, when available, assembly
6633 * language for a machine-language population count.
6634 * We never output a minus, as all those are defaults, so are
6635 * covered by the caret */
6636 const STRLEN wraplen = plen + has_p + has_runon
6637 + has_default /* If needs a caret */
6639 /* If needs a character set specifier */
6640 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6641 + (sizeof(STD_PAT_MODS) - 1)
6642 + (sizeof("(?:)") - 1);
6644 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6645 r->xpv_len_u.xpvlenu_pv = p;
6647 SvFLAGS(rx) |= SVf_UTF8;
6650 /* If a default, cover it using the caret */
6652 *p++= DEFAULT_PAT_MOD;
6656 const char* const name = get_regex_charset_name(r->extflags, &len);
6657 Copy(name, p, len, char);
6661 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6664 while((ch = *fptr++)) {
6672 Copy(RExC_precomp, p, plen, char);
6673 assert ((RX_WRAPPED(rx) - p) < 16);
6674 r->pre_prefix = p - RX_WRAPPED(rx);
6680 SvCUR_set(rx, p - RX_WRAPPED(rx));
6684 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6686 /* setup various meta data about recursion, this all requires
6687 * RExC_npar to be correctly set, and a bit later on we clear it */
6688 if (RExC_seen & REG_RECURSE_SEEN) {
6689 Newxz(RExC_open_parens, RExC_npar,regnode *);
6690 SAVEFREEPV(RExC_open_parens);
6691 Newxz(RExC_close_parens,RExC_npar,regnode *);
6692 SAVEFREEPV(RExC_close_parens);
6694 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6695 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6696 * So its 1 if there are no parens. */
6697 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6698 ((RExC_npar & 0x07) != 0);
6699 Newx(RExC_study_chunk_recursed,
6700 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6701 SAVEFREEPV(RExC_study_chunk_recursed);
6704 /* Useful during FAIL. */
6705 #ifdef RE_TRACK_PATTERN_OFFSETS
6706 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6707 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6708 "%s %"UVuf" bytes for offset annotations.\n",
6709 ri->u.offsets ? "Got" : "Couldn't get",
6710 (UV)((2*RExC_size+1) * sizeof(U32))));
6712 SetProgLen(ri,RExC_size);
6717 /* Second pass: emit code. */
6718 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6719 RExC_pm_flags = pm_flags;
6721 RExC_end = exp + plen;
6724 RExC_emit_start = ri->program;
6725 RExC_emit = ri->program;
6726 RExC_emit_bound = ri->program + RExC_size + 1;
6727 pRExC_state->code_index = 0;
6729 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6730 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6732 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6734 /* XXXX To minimize changes to RE engine we always allocate
6735 3-units-long substrs field. */
6736 Newx(r->substrs, 1, struct reg_substr_data);
6737 if (RExC_recurse_count) {
6738 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6739 SAVEFREEPV(RExC_recurse);
6743 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6744 Zero(r->substrs, 1, struct reg_substr_data);
6745 if (RExC_study_chunk_recursed)
6746 Zero(RExC_study_chunk_recursed,
6747 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6749 #ifdef TRIE_STUDY_OPT
6751 StructCopy(&zero_scan_data, &data, scan_data_t);
6752 copyRExC_state = RExC_state;
6755 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6757 RExC_state = copyRExC_state;
6758 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6759 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6761 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6762 StructCopy(&zero_scan_data, &data, scan_data_t);
6765 StructCopy(&zero_scan_data, &data, scan_data_t);
6768 /* Dig out information for optimizations. */
6769 r->extflags = RExC_flags; /* was pm_op */
6770 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6773 SvUTF8_on(rx); /* Unicode in it? */
6774 ri->regstclass = NULL;
6775 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6776 r->intflags |= PREGf_NAUGHTY;
6777 scan = ri->program + 1; /* First BRANCH. */
6779 /* testing for BRANCH here tells us whether there is "must appear"
6780 data in the pattern. If there is then we can use it for optimisations */
6781 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
6784 STRLEN longest_float_length, longest_fixed_length;
6785 regnode_ssc ch_class; /* pointed to by data */
6787 SSize_t last_close = 0; /* pointed to by data */
6788 regnode *first= scan;
6789 regnode *first_next= regnext(first);
6791 * Skip introductions and multiplicators >= 1
6792 * so that we can extract the 'meat' of the pattern that must
6793 * match in the large if() sequence following.
6794 * NOTE that EXACT is NOT covered here, as it is normally
6795 * picked up by the optimiser separately.
6797 * This is unfortunate as the optimiser isnt handling lookahead
6798 * properly currently.
6801 while ((OP(first) == OPEN && (sawopen = 1)) ||
6802 /* An OR of *one* alternative - should not happen now. */
6803 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6804 /* for now we can't handle lookbehind IFMATCH*/
6805 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6806 (OP(first) == PLUS) ||
6807 (OP(first) == MINMOD) ||
6808 /* An {n,m} with n>0 */
6809 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6810 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6813 * the only op that could be a regnode is PLUS, all the rest
6814 * will be regnode_1 or regnode_2.
6816 * (yves doesn't think this is true)
6818 if (OP(first) == PLUS)
6821 if (OP(first) == MINMOD)
6823 first += regarglen[OP(first)];
6825 first = NEXTOPER(first);
6826 first_next= regnext(first);
6829 /* Starting-point info. */
6831 DEBUG_PEEP("first:",first,0);
6832 /* Ignore EXACT as we deal with it later. */
6833 if (PL_regkind[OP(first)] == EXACT) {
6834 if (OP(first) == EXACT)
6835 NOOP; /* Empty, get anchored substr later. */
6837 ri->regstclass = first;
6840 else if (PL_regkind[OP(first)] == TRIE &&
6841 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6843 /* this can happen only on restudy */
6844 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6847 else if (REGNODE_SIMPLE(OP(first)))
6848 ri->regstclass = first;
6849 else if (PL_regkind[OP(first)] == BOUND ||
6850 PL_regkind[OP(first)] == NBOUND)
6851 ri->regstclass = first;
6852 else if (PL_regkind[OP(first)] == BOL) {
6853 r->intflags |= (OP(first) == MBOL
6855 : (OP(first) == SBOL
6858 first = NEXTOPER(first);
6861 else if (OP(first) == GPOS) {
6862 r->intflags |= PREGf_ANCH_GPOS;
6863 first = NEXTOPER(first);
6866 else if ((!sawopen || !RExC_sawback) &&
6867 (OP(first) == STAR &&
6868 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6869 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6871 /* turn .* into ^.* with an implied $*=1 */
6873 (OP(NEXTOPER(first)) == REG_ANY)
6876 r->intflags |= (type | PREGf_IMPLICIT);
6877 first = NEXTOPER(first);
6880 if (sawplus && !sawminmod && !sawlookahead
6881 && (!sawopen || !RExC_sawback)
6882 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6883 /* x+ must match at the 1st pos of run of x's */
6884 r->intflags |= PREGf_SKIP;
6886 /* Scan is after the zeroth branch, first is atomic matcher. */
6887 #ifdef TRIE_STUDY_OPT
6890 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6891 (IV)(first - scan + 1))
6895 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6896 (IV)(first - scan + 1))
6902 * If there's something expensive in the r.e., find the
6903 * longest literal string that must appear and make it the
6904 * regmust. Resolve ties in favor of later strings, since
6905 * the regstart check works with the beginning of the r.e.
6906 * and avoiding duplication strengthens checking. Not a
6907 * strong reason, but sufficient in the absence of others.
6908 * [Now we resolve ties in favor of the earlier string if
6909 * it happens that c_offset_min has been invalidated, since the
6910 * earlier string may buy us something the later one won't.]
6913 data.longest_fixed = newSVpvs("");
6914 data.longest_float = newSVpvs("");
6915 data.last_found = newSVpvs("");
6916 data.longest = &(data.longest_fixed);
6917 ENTER_with_name("study_chunk");
6918 SAVEFREESV(data.longest_fixed);
6919 SAVEFREESV(data.longest_float);
6920 SAVEFREESV(data.last_found);
6922 if (!ri->regstclass) {
6923 ssc_init(pRExC_state, &ch_class);
6924 data.start_class = &ch_class;
6925 stclass_flag = SCF_DO_STCLASS_AND;
6926 } else /* XXXX Check for BOUND? */
6928 data.last_closep = &last_close;
6931 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6932 scan + RExC_size, /* Up to end */
6934 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6935 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6939 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6942 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6943 && data.last_start_min == 0 && data.last_end > 0
6944 && !RExC_seen_zerolen
6945 && !(RExC_seen & REG_VERBARG_SEEN)
6946 && !(RExC_seen & REG_GPOS_SEEN)
6948 r->extflags |= RXf_CHECK_ALL;
6950 scan_commit(pRExC_state, &data,&minlen,0);
6952 longest_float_length = CHR_SVLEN(data.longest_float);
6954 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6955 && data.offset_fixed == data.offset_float_min
6956 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6957 && S_setup_longest (aTHX_ pRExC_state,
6961 &(r->float_end_shift),
6962 data.lookbehind_float,
6963 data.offset_float_min,
6965 longest_float_length,
6966 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6967 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6969 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6970 r->float_max_offset = data.offset_float_max;
6971 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6972 r->float_max_offset -= data.lookbehind_float;
6973 SvREFCNT_inc_simple_void_NN(data.longest_float);
6976 r->float_substr = r->float_utf8 = NULL;
6977 longest_float_length = 0;
6980 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6982 if (S_setup_longest (aTHX_ pRExC_state,
6984 &(r->anchored_utf8),
6985 &(r->anchored_substr),
6986 &(r->anchored_end_shift),
6987 data.lookbehind_fixed,
6990 longest_fixed_length,
6991 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6992 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6994 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6995 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
6998 r->anchored_substr = r->anchored_utf8 = NULL;
6999 longest_fixed_length = 0;
7001 LEAVE_with_name("study_chunk");
7004 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7005 ri->regstclass = NULL;
7007 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7009 && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7010 && !ssc_is_anything(data.start_class))
7012 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7014 ssc_finalize(pRExC_state, data.start_class);
7016 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7017 StructCopy(data.start_class,
7018 (regnode_ssc*)RExC_rxi->data->data[n],
7020 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7021 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7022 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7023 regprop(r, sv, (regnode*)data.start_class, NULL);
7024 PerlIO_printf(Perl_debug_log,
7025 "synthetic stclass \"%s\".\n",
7026 SvPVX_const(sv));});
7027 data.start_class = NULL;
7030 /* A temporary algorithm prefers floated substr to fixed one to dig
7032 if (longest_fixed_length > longest_float_length) {
7033 r->substrs->check_ix = 0;
7034 r->check_end_shift = r->anchored_end_shift;
7035 r->check_substr = r->anchored_substr;
7036 r->check_utf8 = r->anchored_utf8;
7037 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7038 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7039 r->intflags |= PREGf_NOSCAN;
7042 r->substrs->check_ix = 1;
7043 r->check_end_shift = r->float_end_shift;
7044 r->check_substr = r->float_substr;
7045 r->check_utf8 = r->float_utf8;
7046 r->check_offset_min = r->float_min_offset;
7047 r->check_offset_max = r->float_max_offset;
7049 if ((r->check_substr || r->check_utf8) ) {
7050 r->extflags |= RXf_USE_INTUIT;
7051 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7052 r->extflags |= RXf_INTUIT_TAIL;
7054 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7056 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7057 if ( (STRLEN)minlen < longest_float_length )
7058 minlen= longest_float_length;
7059 if ( (STRLEN)minlen < longest_fixed_length )
7060 minlen= longest_fixed_length;
7064 /* Several toplevels. Best we can is to set minlen. */
7066 regnode_ssc ch_class;
7067 SSize_t last_close = 0;
7069 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7071 scan = ri->program + 1;
7072 ssc_init(pRExC_state, &ch_class);
7073 data.start_class = &ch_class;
7074 data.last_closep = &last_close;
7077 minlen = study_chunk(pRExC_state,
7078 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7079 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7080 ? SCF_TRIE_DOING_RESTUDY
7084 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7086 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7087 = r->float_substr = r->float_utf8 = NULL;
7089 if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7090 && ! ssc_is_anything(data.start_class))
7092 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7094 ssc_finalize(pRExC_state, data.start_class);
7096 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7097 StructCopy(data.start_class,
7098 (regnode_ssc*)RExC_rxi->data->data[n],
7100 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7101 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7102 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7103 regprop(r, sv, (regnode*)data.start_class, NULL);
7104 PerlIO_printf(Perl_debug_log,
7105 "synthetic stclass \"%s\".\n",
7106 SvPVX_const(sv));});
7107 data.start_class = NULL;
7111 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7112 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7113 r->maxlen = REG_INFTY;
7116 r->maxlen = RExC_maxlen;
7119 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7120 the "real" pattern. */
7122 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7123 (IV)minlen, (IV)r->minlen, RExC_maxlen);
7125 r->minlenret = minlen;
7126 if (r->minlen < minlen)
7129 if (RExC_seen & REG_GPOS_SEEN)
7130 r->intflags |= PREGf_GPOS_SEEN;
7131 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7132 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7134 if (pRExC_state->num_code_blocks)
7135 r->extflags |= RXf_EVAL_SEEN;
7136 if (RExC_seen & REG_CANY_SEEN)
7137 r->intflags |= PREGf_CANY_SEEN;
7138 if (RExC_seen & REG_VERBARG_SEEN)
7140 r->intflags |= PREGf_VERBARG_SEEN;
7141 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7143 if (RExC_seen & REG_CUTGROUP_SEEN)
7144 r->intflags |= PREGf_CUTGROUP_SEEN;
7145 if (pm_flags & PMf_USE_RE_EVAL)
7146 r->intflags |= PREGf_USE_RE_EVAL;
7147 if (RExC_paren_names)
7148 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7150 RXp_PAREN_NAMES(r) = NULL;
7152 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7153 * so it can be used in pp.c */
7154 if (r->intflags & PREGf_ANCH)
7155 r->extflags |= RXf_IS_ANCHORED;
7159 /* this is used to identify "special" patterns that might result
7160 * in Perl NOT calling the regex engine and instead doing the match "itself",
7161 * particularly special cases in split//. By having the regex compiler
7162 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7163 * we avoid weird issues with equivalent patterns resulting in different behavior,
7164 * AND we allow non Perl engines to get the same optimizations by the setting the
7165 * flags appropriately - Yves */
7166 regnode *first = ri->program + 1;
7168 regnode *next = NEXTOPER(first);
7171 if (PL_regkind[fop] == NOTHING && nop == END)
7172 r->extflags |= RXf_NULL;
7173 else if (PL_regkind[fop] == BOL && nop == END)
7174 r->extflags |= RXf_START_ONLY;
7175 else if (fop == PLUS
7176 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7177 && OP(regnext(first)) == END)
7178 r->extflags |= RXf_WHITE;
7179 else if ( r->extflags & RXf_SPLIT
7181 && STR_LEN(first) == 1
7182 && *(STRING(first)) == ' '
7183 && OP(regnext(first)) == END )
7184 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7188 if (RExC_contains_locale) {
7189 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7193 if (RExC_paren_names) {
7194 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7195 ri->data->data[ri->name_list_idx]
7196 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7199 ri->name_list_idx = 0;
7201 if (RExC_recurse_count) {
7202 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7203 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7204 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7207 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7208 /* assume we don't need to swap parens around before we match */
7212 PerlIO_printf(Perl_debug_log,"Final program:\n");
7215 #ifdef RE_TRACK_PATTERN_OFFSETS
7216 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7217 const STRLEN len = ri->u.offsets[0];
7219 GET_RE_DEBUG_FLAGS_DECL;
7220 PerlIO_printf(Perl_debug_log,
7221 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7222 for (i = 1; i <= len; i++) {
7223 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7224 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7225 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7227 PerlIO_printf(Perl_debug_log, "\n");
7232 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7233 * by setting the regexp SV to readonly-only instead. If the
7234 * pattern's been recompiled, the USEDness should remain. */
7235 if (old_re && SvREADONLY(old_re))
7243 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7246 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7248 PERL_UNUSED_ARG(value);
7250 if (flags & RXapif_FETCH) {
7251 return reg_named_buff_fetch(rx, key, flags);
7252 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7253 Perl_croak_no_modify();
7255 } else if (flags & RXapif_EXISTS) {
7256 return reg_named_buff_exists(rx, key, flags)
7259 } else if (flags & RXapif_REGNAMES) {
7260 return reg_named_buff_all(rx, flags);
7261 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7262 return reg_named_buff_scalar(rx, flags);
7264 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7270 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7273 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7274 PERL_UNUSED_ARG(lastkey);
7276 if (flags & RXapif_FIRSTKEY)
7277 return reg_named_buff_firstkey(rx, flags);
7278 else if (flags & RXapif_NEXTKEY)
7279 return reg_named_buff_nextkey(rx, flags);
7281 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7288 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7291 AV *retarray = NULL;
7293 struct regexp *const rx = ReANY(r);
7295 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7297 if (flags & RXapif_ALL)
7300 if (rx && RXp_PAREN_NAMES(rx)) {
7301 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7304 SV* sv_dat=HeVAL(he_str);
7305 I32 *nums=(I32*)SvPVX(sv_dat);
7306 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7307 if ((I32)(rx->nparens) >= nums[i]
7308 && rx->offs[nums[i]].start != -1
7309 && rx->offs[nums[i]].end != -1)
7312 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7317 ret = newSVsv(&PL_sv_undef);
7320 av_push(retarray, ret);
7323 return newRV_noinc(MUTABLE_SV(retarray));
7330 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7333 struct regexp *const rx = ReANY(r);
7335 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7337 if (rx && RXp_PAREN_NAMES(rx)) {
7338 if (flags & RXapif_ALL) {
7339 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7341 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7343 SvREFCNT_dec_NN(sv);
7355 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7357 struct regexp *const rx = ReANY(r);
7359 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7361 if ( rx && RXp_PAREN_NAMES(rx) ) {
7362 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7364 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7371 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7373 struct regexp *const rx = ReANY(r);
7374 GET_RE_DEBUG_FLAGS_DECL;
7376 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7378 if (rx && RXp_PAREN_NAMES(rx)) {
7379 HV *hv = RXp_PAREN_NAMES(rx);
7381 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7384 SV* sv_dat = HeVAL(temphe);
7385 I32 *nums = (I32*)SvPVX(sv_dat);
7386 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7387 if ((I32)(rx->lastparen) >= nums[i] &&
7388 rx->offs[nums[i]].start != -1 &&
7389 rx->offs[nums[i]].end != -1)
7395 if (parno || flags & RXapif_ALL) {
7396 return newSVhek(HeKEY_hek(temphe));
7404 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7409 struct regexp *const rx = ReANY(r);
7411 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7413 if (rx && RXp_PAREN_NAMES(rx)) {
7414 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7415 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7416 } else if (flags & RXapif_ONE) {
7417 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7418 av = MUTABLE_AV(SvRV(ret));
7419 length = av_tindex(av);
7420 SvREFCNT_dec_NN(ret);
7421 return newSViv(length + 1);
7423 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7428 return &PL_sv_undef;
7432 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7434 struct regexp *const rx = ReANY(r);
7437 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7439 if (rx && RXp_PAREN_NAMES(rx)) {
7440 HV *hv= RXp_PAREN_NAMES(rx);
7442 (void)hv_iterinit(hv);
7443 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7446 SV* sv_dat = HeVAL(temphe);
7447 I32 *nums = (I32*)SvPVX(sv_dat);
7448 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7449 if ((I32)(rx->lastparen) >= nums[i] &&
7450 rx->offs[nums[i]].start != -1 &&
7451 rx->offs[nums[i]].end != -1)
7457 if (parno || flags & RXapif_ALL) {
7458 av_push(av, newSVhek(HeKEY_hek(temphe)));
7463 return newRV_noinc(MUTABLE_SV(av));
7467 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7470 struct regexp *const rx = ReANY(r);
7476 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7478 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7479 || n == RX_BUFF_IDX_CARET_FULLMATCH
7480 || n == RX_BUFF_IDX_CARET_POSTMATCH
7483 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7485 /* on something like
7488 * the KEEPCOPY is set on the PMOP rather than the regex */
7489 if (PL_curpm && r == PM_GETRE(PL_curpm))
7490 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7499 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7500 /* no need to distinguish between them any more */
7501 n = RX_BUFF_IDX_FULLMATCH;
7503 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7504 && rx->offs[0].start != -1)
7506 /* $`, ${^PREMATCH} */
7507 i = rx->offs[0].start;
7511 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7512 && rx->offs[0].end != -1)
7514 /* $', ${^POSTMATCH} */
7515 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7516 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7519 if ( 0 <= n && n <= (I32)rx->nparens &&
7520 (s1 = rx->offs[n].start) != -1 &&
7521 (t1 = rx->offs[n].end) != -1)
7523 /* $&, ${^MATCH}, $1 ... */
7525 s = rx->subbeg + s1 - rx->suboffset;
7530 assert(s >= rx->subbeg);
7531 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7533 #ifdef NO_TAINT_SUPPORT
7534 sv_setpvn(sv, s, i);
7536 const int oldtainted = TAINT_get;
7538 sv_setpvn(sv, s, i);
7539 TAINT_set(oldtainted);
7541 if ( (rx->intflags & PREGf_CANY_SEEN)
7542 ? (RXp_MATCH_UTF8(rx)
7543 && (!i || is_utf8_string((U8*)s, i)))
7544 : (RXp_MATCH_UTF8(rx)) )
7551 if (RXp_MATCH_TAINTED(rx)) {
7552 if (SvTYPE(sv) >= SVt_PVMG) {
7553 MAGIC* const mg = SvMAGIC(sv);
7556 SvMAGIC_set(sv, mg->mg_moremagic);
7558 if ((mgt = SvMAGIC(sv))) {
7559 mg->mg_moremagic = mgt;
7560 SvMAGIC_set(sv, mg);
7571 sv_setsv(sv,&PL_sv_undef);
7577 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7578 SV const * const value)
7580 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7582 PERL_UNUSED_ARG(rx);
7583 PERL_UNUSED_ARG(paren);
7584 PERL_UNUSED_ARG(value);
7587 Perl_croak_no_modify();
7591 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7594 struct regexp *const rx = ReANY(r);
7598 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7600 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7601 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7602 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7605 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7607 /* on something like
7610 * the KEEPCOPY is set on the PMOP rather than the regex */
7611 if (PL_curpm && r == PM_GETRE(PL_curpm))
7612 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7618 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7620 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7621 case RX_BUFF_IDX_PREMATCH: /* $` */
7622 if (rx->offs[0].start != -1) {
7623 i = rx->offs[0].start;
7632 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7633 case RX_BUFF_IDX_POSTMATCH: /* $' */
7634 if (rx->offs[0].end != -1) {
7635 i = rx->sublen - rx->offs[0].end;
7637 s1 = rx->offs[0].end;
7644 default: /* $& / ${^MATCH}, $1, $2, ... */
7645 if (paren <= (I32)rx->nparens &&
7646 (s1 = rx->offs[paren].start) != -1 &&
7647 (t1 = rx->offs[paren].end) != -1)
7653 if (ckWARN(WARN_UNINITIALIZED))
7654 report_uninit((const SV *)sv);
7659 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7660 const char * const s = rx->subbeg - rx->suboffset + s1;
7665 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7672 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7674 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7675 PERL_UNUSED_ARG(rx);
7679 return newSVpvs("Regexp");
7682 /* Scans the name of a named buffer from the pattern.
7683 * If flags is REG_RSN_RETURN_NULL returns null.
7684 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7685 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7686 * to the parsed name as looked up in the RExC_paren_names hash.
7687 * If there is an error throws a vFAIL().. type exception.
7690 #define REG_RSN_RETURN_NULL 0
7691 #define REG_RSN_RETURN_NAME 1
7692 #define REG_RSN_RETURN_DATA 2
7695 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7697 char *name_start = RExC_parse;
7699 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7701 assert (RExC_parse <= RExC_end);
7702 if (RExC_parse == RExC_end) NOOP;
7703 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7704 /* skip IDFIRST by using do...while */
7707 RExC_parse += UTF8SKIP(RExC_parse);
7708 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7712 } while (isWORDCHAR(*RExC_parse));
7714 RExC_parse++; /* so the <- from the vFAIL is after the offending
7716 vFAIL("Group name must start with a non-digit word character");
7720 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7721 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7722 if ( flags == REG_RSN_RETURN_NAME)
7724 else if (flags==REG_RSN_RETURN_DATA) {
7727 if ( ! sv_name ) /* should not happen*/
7728 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7729 if (RExC_paren_names)
7730 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7732 sv_dat = HeVAL(he_str);
7734 vFAIL("Reference to nonexistent named group");
7738 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7739 (unsigned long) flags);
7741 assert(0); /* NOT REACHED */
7746 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7747 int rem=(int)(RExC_end - RExC_parse); \
7756 if (RExC_lastparse!=RExC_parse) \
7757 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7760 iscut ? "..." : "<" \
7763 PerlIO_printf(Perl_debug_log,"%16s",""); \
7766 num = RExC_size + 1; \
7768 num=REG_NODE_NUM(RExC_emit); \
7769 if (RExC_lastnum!=num) \
7770 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7772 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7773 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7774 (int)((depth*2)), "", \
7778 RExC_lastparse=RExC_parse; \
7783 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7784 DEBUG_PARSE_MSG((funcname)); \
7785 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7787 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7788 DEBUG_PARSE_MSG((funcname)); \
7789 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7792 /* This section of code defines the inversion list object and its methods. The
7793 * interfaces are highly subject to change, so as much as possible is static to
7794 * this file. An inversion list is here implemented as a malloc'd C UV array
7795 * as an SVt_INVLIST scalar.
7797 * An inversion list for Unicode is an array of code points, sorted by ordinal
7798 * number. The zeroth element is the first code point in the list. The 1th
7799 * element is the first element beyond that not in the list. In other words,
7800 * the first range is
7801 * invlist[0]..(invlist[1]-1)
7802 * The other ranges follow. Thus every element whose index is divisible by two
7803 * marks the beginning of a range that is in the list, and every element not
7804 * divisible by two marks the beginning of a range not in the list. A single
7805 * element inversion list that contains the single code point N generally
7806 * consists of two elements
7809 * (The exception is when N is the highest representable value on the
7810 * machine, in which case the list containing just it would be a single
7811 * element, itself. By extension, if the last range in the list extends to
7812 * infinity, then the first element of that range will be in the inversion list
7813 * at a position that is divisible by two, and is the final element in the
7815 * Taking the complement (inverting) an inversion list is quite simple, if the
7816 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7817 * This implementation reserves an element at the beginning of each inversion
7818 * list to always contain 0; there is an additional flag in the header which
7819 * indicates if the list begins at the 0, or is offset to begin at the next
7822 * More about inversion lists can be found in "Unicode Demystified"
7823 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7824 * More will be coming when functionality is added later.
7826 * The inversion list data structure is currently implemented as an SV pointing
7827 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7828 * array of UV whose memory management is automatically handled by the existing
7829 * facilities for SV's.
7831 * Some of the methods should always be private to the implementation, and some
7832 * should eventually be made public */
7834 /* The header definitions are in F<inline_invlist.c> */
7836 PERL_STATIC_INLINE UV*
7837 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7839 /* Returns a pointer to the first element in the inversion list's array.
7840 * This is called upon initialization of an inversion list. Where the
7841 * array begins depends on whether the list has the code point U+0000 in it
7842 * or not. The other parameter tells it whether the code that follows this
7843 * call is about to put a 0 in the inversion list or not. The first
7844 * element is either the element reserved for 0, if TRUE, or the element
7845 * after it, if FALSE */
7847 bool* offset = get_invlist_offset_addr(invlist);
7848 UV* zero_addr = (UV *) SvPVX(invlist);
7850 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7853 assert(! _invlist_len(invlist));
7857 /* 1^1 = 0; 1^0 = 1 */
7858 *offset = 1 ^ will_have_0;
7859 return zero_addr + *offset;
7862 PERL_STATIC_INLINE UV*
7863 S_invlist_array(SV* const invlist)
7865 /* Returns the pointer to the inversion list's array. Every time the
7866 * length changes, this needs to be called in case malloc or realloc moved
7869 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7871 /* Must not be empty. If these fail, you probably didn't check for <len>
7872 * being non-zero before trying to get the array */
7873 assert(_invlist_len(invlist));
7875 /* The very first element always contains zero, The array begins either
7876 * there, or if the inversion list is offset, at the element after it.
7877 * The offset header field determines which; it contains 0 or 1 to indicate
7878 * how much additionally to add */
7879 assert(0 == *(SvPVX(invlist)));
7880 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7883 PERL_STATIC_INLINE void
7884 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7886 /* Sets the current number of elements stored in the inversion list.
7887 * Updates SvCUR correspondingly */
7888 PERL_UNUSED_CONTEXT;
7889 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7891 assert(SvTYPE(invlist) == SVt_INVLIST);
7896 : TO_INTERNAL_SIZE(len + offset));
7897 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7900 PERL_STATIC_INLINE IV*
7901 S_get_invlist_previous_index_addr(SV* invlist)
7903 /* Return the address of the IV that is reserved to hold the cached index
7905 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7907 assert(SvTYPE(invlist) == SVt_INVLIST);
7909 return &(((XINVLIST*) SvANY(invlist))->prev_index);
7912 PERL_STATIC_INLINE IV
7913 S_invlist_previous_index(SV* const invlist)
7915 /* Returns cached index of previous search */
7917 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7919 return *get_invlist_previous_index_addr(invlist);
7922 PERL_STATIC_INLINE void
7923 S_invlist_set_previous_index(SV* const invlist, const IV index)
7925 /* Caches <index> for later retrieval */
7927 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7929 assert(index == 0 || index < (int) _invlist_len(invlist));
7931 *get_invlist_previous_index_addr(invlist) = index;
7934 PERL_STATIC_INLINE UV
7935 S_invlist_max(SV* const invlist)
7937 /* Returns the maximum number of elements storable in the inversion list's
7938 * array, without having to realloc() */
7940 PERL_ARGS_ASSERT_INVLIST_MAX;
7942 assert(SvTYPE(invlist) == SVt_INVLIST);
7944 /* Assumes worst case, in which the 0 element is not counted in the
7945 * inversion list, so subtracts 1 for that */
7946 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7947 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7948 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7951 #ifndef PERL_IN_XSUB_RE
7953 Perl__new_invlist(pTHX_ IV initial_size)
7956 /* Return a pointer to a newly constructed inversion list, with enough
7957 * space to store 'initial_size' elements. If that number is negative, a
7958 * system default is used instead */
7962 if (initial_size < 0) {
7966 /* Allocate the initial space */
7967 new_list = newSV_type(SVt_INVLIST);
7969 /* First 1 is in case the zero element isn't in the list; second 1 is for
7971 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7972 invlist_set_len(new_list, 0, 0);
7974 /* Force iterinit() to be used to get iteration to work */
7975 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7977 *get_invlist_previous_index_addr(new_list) = 0;
7983 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7985 /* Return a pointer to a newly constructed inversion list, initialized to
7986 * point to <list>, which has to be in the exact correct inversion list
7987 * form, including internal fields. Thus this is a dangerous routine that
7988 * should not be used in the wrong hands. The passed in 'list' contains
7989 * several header fields at the beginning that are not part of the
7990 * inversion list body proper */
7992 const STRLEN length = (STRLEN) list[0];
7993 const UV version_id = list[1];
7994 const bool offset = cBOOL(list[2]);
7995 #define HEADER_LENGTH 3
7996 /* If any of the above changes in any way, you must change HEADER_LENGTH
7997 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
7998 * perl -E 'say int(rand 2**31-1)'
8000 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8001 data structure type, so that one being
8002 passed in can be validated to be an
8003 inversion list of the correct vintage.
8006 SV* invlist = newSV_type(SVt_INVLIST);
8008 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8010 if (version_id != INVLIST_VERSION_ID) {
8011 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8014 /* The generated array passed in includes header elements that aren't part
8015 * of the list proper, so start it just after them */
8016 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8018 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8019 shouldn't touch it */
8021 *(get_invlist_offset_addr(invlist)) = offset;
8023 /* The 'length' passed to us is the physical number of elements in the
8024 * inversion list. But if there is an offset the logical number is one
8026 invlist_set_len(invlist, length - offset, offset);
8028 invlist_set_previous_index(invlist, 0);
8030 /* Initialize the iteration pointer. */
8031 invlist_iterfinish(invlist);
8033 SvREADONLY_on(invlist);
8037 #endif /* ifndef PERL_IN_XSUB_RE */
8040 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8042 /* Grow the maximum size of an inversion list */
8044 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8046 assert(SvTYPE(invlist) == SVt_INVLIST);
8048 /* Add one to account for the zero element at the beginning which may not
8049 * be counted by the calling parameters */
8050 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8053 PERL_STATIC_INLINE void
8054 S_invlist_trim(SV* const invlist)
8056 PERL_ARGS_ASSERT_INVLIST_TRIM;
8058 assert(SvTYPE(invlist) == SVt_INVLIST);
8060 /* Change the length of the inversion list to how many entries it currently
8062 SvPV_shrink_to_cur((SV *) invlist);
8066 S__append_range_to_invlist(pTHX_ SV* const invlist,
8067 const UV start, const UV end)
8069 /* Subject to change or removal. Append the range from 'start' to 'end' at
8070 * the end of the inversion list. The range must be above any existing
8074 UV max = invlist_max(invlist);
8075 UV len = _invlist_len(invlist);
8078 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8080 if (len == 0) { /* Empty lists must be initialized */
8081 offset = start != 0;
8082 array = _invlist_array_init(invlist, ! offset);
8085 /* Here, the existing list is non-empty. The current max entry in the
8086 * list is generally the first value not in the set, except when the
8087 * set extends to the end of permissible values, in which case it is
8088 * the first entry in that final set, and so this call is an attempt to
8089 * append out-of-order */
8091 UV final_element = len - 1;
8092 array = invlist_array(invlist);
8093 if (array[final_element] > start
8094 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8096 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8097 array[final_element], start,
8098 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8101 /* Here, it is a legal append. If the new range begins with the first
8102 * value not in the set, it is extending the set, so the new first
8103 * value not in the set is one greater than the newly extended range.
8105 offset = *get_invlist_offset_addr(invlist);
8106 if (array[final_element] == start) {
8107 if (end != UV_MAX) {
8108 array[final_element] = end + 1;
8111 /* But if the end is the maximum representable on the machine,
8112 * just let the range that this would extend to have no end */
8113 invlist_set_len(invlist, len - 1, offset);
8119 /* Here the new range doesn't extend any existing set. Add it */
8121 len += 2; /* Includes an element each for the start and end of range */
8123 /* If wll overflow the existing space, extend, which may cause the array to
8126 invlist_extend(invlist, len);
8128 /* Have to set len here to avoid assert failure in invlist_array() */
8129 invlist_set_len(invlist, len, offset);
8131 array = invlist_array(invlist);
8134 invlist_set_len(invlist, len, offset);
8137 /* The next item on the list starts the range, the one after that is
8138 * one past the new range. */
8139 array[len - 2] = start;
8140 if (end != UV_MAX) {
8141 array[len - 1] = end + 1;
8144 /* But if the end is the maximum representable on the machine, just let
8145 * the range have no end */
8146 invlist_set_len(invlist, len - 1, offset);
8150 #ifndef PERL_IN_XSUB_RE
8153 Perl__invlist_search(SV* const invlist, const UV cp)
8155 /* Searches the inversion list for the entry that contains the input code
8156 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8157 * return value is the index into the list's array of the range that
8162 IV high = _invlist_len(invlist);
8163 const IV highest_element = high - 1;
8166 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8168 /* If list is empty, return failure. */
8173 /* (We can't get the array unless we know the list is non-empty) */
8174 array = invlist_array(invlist);
8176 mid = invlist_previous_index(invlist);
8177 assert(mid >=0 && mid <= highest_element);
8179 /* <mid> contains the cache of the result of the previous call to this
8180 * function (0 the first time). See if this call is for the same result,
8181 * or if it is for mid-1. This is under the theory that calls to this
8182 * function will often be for related code points that are near each other.
8183 * And benchmarks show that caching gives better results. We also test
8184 * here if the code point is within the bounds of the list. These tests
8185 * replace others that would have had to be made anyway to make sure that
8186 * the array bounds were not exceeded, and these give us extra information
8187 * at the same time */
8188 if (cp >= array[mid]) {
8189 if (cp >= array[highest_element]) {
8190 return highest_element;
8193 /* Here, array[mid] <= cp < array[highest_element]. This means that
8194 * the final element is not the answer, so can exclude it; it also
8195 * means that <mid> is not the final element, so can refer to 'mid + 1'
8197 if (cp < array[mid + 1]) {
8203 else { /* cp < aray[mid] */
8204 if (cp < array[0]) { /* Fail if outside the array */
8208 if (cp >= array[mid - 1]) {
8213 /* Binary search. What we are looking for is <i> such that
8214 * array[i] <= cp < array[i+1]
8215 * The loop below converges on the i+1. Note that there may not be an
8216 * (i+1)th element in the array, and things work nonetheless */
8217 while (low < high) {
8218 mid = (low + high) / 2;
8219 assert(mid <= highest_element);
8220 if (array[mid] <= cp) { /* cp >= array[mid] */
8223 /* We could do this extra test to exit the loop early.
8224 if (cp < array[low]) {
8229 else { /* cp < array[mid] */
8236 invlist_set_previous_index(invlist, high);
8241 Perl__invlist_populate_swatch(SV* const invlist,
8242 const UV start, const UV end, U8* swatch)
8244 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8245 * but is used when the swash has an inversion list. This makes this much
8246 * faster, as it uses a binary search instead of a linear one. This is
8247 * intimately tied to that function, and perhaps should be in utf8.c,
8248 * except it is intimately tied to inversion lists as well. It assumes
8249 * that <swatch> is all 0's on input */
8252 const IV len = _invlist_len(invlist);
8256 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8258 if (len == 0) { /* Empty inversion list */
8262 array = invlist_array(invlist);
8264 /* Find which element it is */
8265 i = _invlist_search(invlist, start);
8267 /* We populate from <start> to <end> */
8268 while (current < end) {
8271 /* The inversion list gives the results for every possible code point
8272 * after the first one in the list. Only those ranges whose index is
8273 * even are ones that the inversion list matches. For the odd ones,
8274 * and if the initial code point is not in the list, we have to skip
8275 * forward to the next element */
8276 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8278 if (i >= len) { /* Finished if beyond the end of the array */
8282 if (current >= end) { /* Finished if beyond the end of what we
8284 if (LIKELY(end < UV_MAX)) {
8288 /* We get here when the upper bound is the maximum
8289 * representable on the machine, and we are looking for just
8290 * that code point. Have to special case it */
8292 goto join_end_of_list;
8295 assert(current >= start);
8297 /* The current range ends one below the next one, except don't go past
8300 upper = (i < len && array[i] < end) ? array[i] : end;
8302 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8303 * for each code point in it */
8304 for (; current < upper; current++) {
8305 const STRLEN offset = (STRLEN)(current - start);
8306 swatch[offset >> 3] |= 1 << (offset & 7);
8311 /* Quit if at the end of the list */
8314 /* But first, have to deal with the highest possible code point on
8315 * the platform. The previous code assumes that <end> is one
8316 * beyond where we want to populate, but that is impossible at the
8317 * platform's infinity, so have to handle it specially */
8318 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8320 const STRLEN offset = (STRLEN)(end - start);
8321 swatch[offset >> 3] |= 1 << (offset & 7);
8326 /* Advance to the next range, which will be for code points not in the
8335 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8336 const bool complement_b, SV** output)
8338 /* Take the union of two inversion lists and point <output> to it. *output
8339 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8340 * the reference count to that list will be decremented if not already a
8341 * temporary (mortal); otherwise *output will be made correspondingly
8342 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8343 * second list is returned. If <complement_b> is TRUE, the union is taken
8344 * of the complement (inversion) of <b> instead of b itself.
8346 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8347 * Richard Gillam, published by Addison-Wesley, and explained at some
8348 * length there. The preface says to incorporate its examples into your
8349 * code at your own risk.
8351 * The algorithm is like a merge sort.
8353 * XXX A potential performance improvement is to keep track as we go along
8354 * if only one of the inputs contributes to the result, meaning the other
8355 * is a subset of that one. In that case, we can skip the final copy and
8356 * return the larger of the input lists, but then outside code might need
8357 * to keep track of whether to free the input list or not */
8359 const UV* array_a; /* a's array */
8361 UV len_a; /* length of a's array */
8364 SV* u; /* the resulting union */
8368 UV i_a = 0; /* current index into a's array */
8372 /* running count, as explained in the algorithm source book; items are
8373 * stopped accumulating and are output when the count changes to/from 0.
8374 * The count is incremented when we start a range that's in the set, and
8375 * decremented when we start a range that's not in the set. So its range
8376 * is 0 to 2. Only when the count is zero is something not in the set.
8380 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8383 /* If either one is empty, the union is the other one */
8384 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8385 bool make_temp = FALSE; /* Should we mortalize the result? */
8389 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8395 *output = invlist_clone(b);
8397 _invlist_invert(*output);
8399 } /* else *output already = b; */
8402 sv_2mortal(*output);
8406 else if ((len_b = _invlist_len(b)) == 0) {
8407 bool make_temp = FALSE;
8409 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8414 /* The complement of an empty list is a list that has everything in it,
8415 * so the union with <a> includes everything too */
8418 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8422 *output = _new_invlist(1);
8423 _append_range_to_invlist(*output, 0, UV_MAX);
8425 else if (*output != a) {
8426 *output = invlist_clone(a);
8428 /* else *output already = a; */
8431 sv_2mortal(*output);
8436 /* Here both lists exist and are non-empty */
8437 array_a = invlist_array(a);
8438 array_b = invlist_array(b);
8440 /* If are to take the union of 'a' with the complement of b, set it
8441 * up so are looking at b's complement. */
8444 /* To complement, we invert: if the first element is 0, remove it. To
8445 * do this, we just pretend the array starts one later */
8446 if (array_b[0] == 0) {
8452 /* But if the first element is not zero, we pretend the list starts
8453 * at the 0 that is always stored immediately before the array. */
8459 /* Size the union for the worst case: that the sets are completely
8461 u = _new_invlist(len_a + len_b);
8463 /* Will contain U+0000 if either component does */
8464 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8465 || (len_b > 0 && array_b[0] == 0));
8467 /* Go through each list item by item, stopping when exhausted one of
8469 while (i_a < len_a && i_b < len_b) {
8470 UV cp; /* The element to potentially add to the union's array */
8471 bool cp_in_set; /* is it in the the input list's set or not */
8473 /* We need to take one or the other of the two inputs for the union.
8474 * Since we are merging two sorted lists, we take the smaller of the
8475 * next items. In case of a tie, we take the one that is in its set
8476 * first. If we took one not in the set first, it would decrement the
8477 * count, possibly to 0 which would cause it to be output as ending the
8478 * range, and the next time through we would take the same number, and
8479 * output it again as beginning the next range. By doing it the
8480 * opposite way, there is no possibility that the count will be
8481 * momentarily decremented to 0, and thus the two adjoining ranges will
8482 * be seamlessly merged. (In a tie and both are in the set or both not
8483 * in the set, it doesn't matter which we take first.) */
8484 if (array_a[i_a] < array_b[i_b]
8485 || (array_a[i_a] == array_b[i_b]
8486 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8488 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8492 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8493 cp = array_b[i_b++];
8496 /* Here, have chosen which of the two inputs to look at. Only output
8497 * if the running count changes to/from 0, which marks the
8498 * beginning/end of a range in that's in the set */
8501 array_u[i_u++] = cp;
8508 array_u[i_u++] = cp;
8513 /* Here, we are finished going through at least one of the lists, which
8514 * means there is something remaining in at most one. We check if the list
8515 * that hasn't been exhausted is positioned such that we are in the middle
8516 * of a range in its set or not. (i_a and i_b point to the element beyond
8517 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8518 * is potentially more to output.
8519 * There are four cases:
8520 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8521 * in the union is entirely from the non-exhausted set.
8522 * 2) Both were in their sets, count is 2. Nothing further should
8523 * be output, as everything that remains will be in the exhausted
8524 * list's set, hence in the union; decrementing to 1 but not 0 insures
8526 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8527 * Nothing further should be output because the union includes
8528 * everything from the exhausted set. Not decrementing ensures that.
8529 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8530 * decrementing to 0 insures that we look at the remainder of the
8531 * non-exhausted set */
8532 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8533 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8538 /* The final length is what we've output so far, plus what else is about to
8539 * be output. (If 'count' is non-zero, then the input list we exhausted
8540 * has everything remaining up to the machine's limit in its set, and hence
8541 * in the union, so there will be no further output. */
8544 /* At most one of the subexpressions will be non-zero */
8545 len_u += (len_a - i_a) + (len_b - i_b);
8548 /* Set result to final length, which can change the pointer to array_u, so
8550 if (len_u != _invlist_len(u)) {
8551 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8553 array_u = invlist_array(u);
8556 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8557 * the other) ended with everything above it not in its set. That means
8558 * that the remaining part of the union is precisely the same as the
8559 * non-exhausted list, so can just copy it unchanged. (If both list were
8560 * exhausted at the same time, then the operations below will be both 0.)
8563 IV copy_count; /* At most one will have a non-zero copy count */
8564 if ((copy_count = len_a - i_a) > 0) {
8565 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8567 else if ((copy_count = len_b - i_b) > 0) {
8568 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8572 /* We may be removing a reference to one of the inputs. If so, the output
8573 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8574 * count decremented) */
8575 if (a == *output || b == *output) {
8576 assert(! invlist_is_iterating(*output));
8577 if ((SvTEMP(*output))) {
8581 SvREFCNT_dec_NN(*output);
8591 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8592 const bool complement_b, SV** i)
8594 /* Take the intersection of two inversion lists and point <i> to it. *i
8595 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8596 * the reference count to that list will be decremented if not already a
8597 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8598 * The first list, <a>, may be NULL, in which case an empty list is
8599 * returned. If <complement_b> is TRUE, the result will be the
8600 * intersection of <a> and the complement (or inversion) of <b> instead of
8603 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8604 * Richard Gillam, published by Addison-Wesley, and explained at some
8605 * length there. The preface says to incorporate its examples into your
8606 * code at your own risk. In fact, it had bugs
8608 * The algorithm is like a merge sort, and is essentially the same as the
8612 const UV* array_a; /* a's array */
8614 UV len_a; /* length of a's array */
8617 SV* r; /* the resulting intersection */
8621 UV i_a = 0; /* current index into a's array */
8625 /* running count, as explained in the algorithm source book; items are
8626 * stopped accumulating and are output when the count changes to/from 2.
8627 * The count is incremented when we start a range that's in the set, and
8628 * decremented when we start a range that's not in the set. So its range
8629 * is 0 to 2. Only when the count is 2 is something in the intersection.
8633 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8636 /* Special case if either one is empty */
8637 len_a = (a == NULL) ? 0 : _invlist_len(a);
8638 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8639 bool make_temp = FALSE;
8641 if (len_a != 0 && complement_b) {
8643 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8644 * be empty. Here, also we are using 'b's complement, which hence
8645 * must be every possible code point. Thus the intersection is
8649 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8654 *i = invlist_clone(a);
8656 /* else *i is already 'a' */
8664 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8665 * intersection must be empty */
8667 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8672 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8676 *i = _new_invlist(0);
8684 /* Here both lists exist and are non-empty */
8685 array_a = invlist_array(a);
8686 array_b = invlist_array(b);
8688 /* If are to take the intersection of 'a' with the complement of b, set it
8689 * up so are looking at b's complement. */
8692 /* To complement, we invert: if the first element is 0, remove it. To
8693 * do this, we just pretend the array starts one later */
8694 if (array_b[0] == 0) {
8700 /* But if the first element is not zero, we pretend the list starts
8701 * at the 0 that is always stored immediately before the array. */
8707 /* Size the intersection for the worst case: that the intersection ends up
8708 * fragmenting everything to be completely disjoint */
8709 r= _new_invlist(len_a + len_b);
8711 /* Will contain U+0000 iff both components do */
8712 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8713 && len_b > 0 && array_b[0] == 0);
8715 /* Go through each list item by item, stopping when exhausted one of
8717 while (i_a < len_a && i_b < len_b) {
8718 UV cp; /* The element to potentially add to the intersection's
8720 bool cp_in_set; /* Is it in the input list's set or not */
8722 /* We need to take one or the other of the two inputs for the
8723 * intersection. Since we are merging two sorted lists, we take the
8724 * smaller of the next items. In case of a tie, we take the one that
8725 * is not in its set first (a difference from the union algorithm). If
8726 * we took one in the set first, it would increment the count, possibly
8727 * to 2 which would cause it to be output as starting a range in the
8728 * intersection, and the next time through we would take that same
8729 * number, and output it again as ending the set. By doing it the
8730 * opposite of this, there is no possibility that the count will be
8731 * momentarily incremented to 2. (In a tie and both are in the set or
8732 * both not in the set, it doesn't matter which we take first.) */
8733 if (array_a[i_a] < array_b[i_b]
8734 || (array_a[i_a] == array_b[i_b]
8735 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8737 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8741 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8745 /* Here, have chosen which of the two inputs to look at. Only output
8746 * if the running count changes to/from 2, which marks the
8747 * beginning/end of a range that's in the intersection */
8751 array_r[i_r++] = cp;
8756 array_r[i_r++] = cp;
8762 /* Here, we are finished going through at least one of the lists, which
8763 * means there is something remaining in at most one. We check if the list
8764 * that has been exhausted is positioned such that we are in the middle
8765 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8766 * the ones we care about.) There are four cases:
8767 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8768 * nothing left in the intersection.
8769 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8770 * above 2. What should be output is exactly that which is in the
8771 * non-exhausted set, as everything it has is also in the intersection
8772 * set, and everything it doesn't have can't be in the intersection
8773 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8774 * gets incremented to 2. Like the previous case, the intersection is
8775 * everything that remains in the non-exhausted set.
8776 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8777 * remains 1. And the intersection has nothing more. */
8778 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8779 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8784 /* The final length is what we've output so far plus what else is in the
8785 * intersection. At most one of the subexpressions below will be non-zero
8789 len_r += (len_a - i_a) + (len_b - i_b);
8792 /* Set result to final length, which can change the pointer to array_r, so
8794 if (len_r != _invlist_len(r)) {
8795 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8797 array_r = invlist_array(r);
8800 /* Finish outputting any remaining */
8801 if (count >= 2) { /* At most one will have a non-zero copy count */
8803 if ((copy_count = len_a - i_a) > 0) {
8804 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8806 else if ((copy_count = len_b - i_b) > 0) {
8807 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8811 /* We may be removing a reference to one of the inputs. If so, the output
8812 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8813 * count decremented) */
8814 if (a == *i || b == *i) {
8815 assert(! invlist_is_iterating(*i));
8820 SvREFCNT_dec_NN(*i);
8830 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8832 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8833 * set. A pointer to the inversion list is returned. This may actually be
8834 * a new list, in which case the passed in one has been destroyed. The
8835 * passed in inversion list can be NULL, in which case a new one is created
8836 * with just the one range in it */
8841 if (invlist == NULL) {
8842 invlist = _new_invlist(2);
8846 len = _invlist_len(invlist);
8849 /* If comes after the final entry actually in the list, can just append it
8852 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8853 && start >= invlist_array(invlist)[len - 1]))
8855 _append_range_to_invlist(invlist, start, end);
8859 /* Here, can't just append things, create and return a new inversion list
8860 * which is the union of this range and the existing inversion list */
8861 range_invlist = _new_invlist(2);
8862 _append_range_to_invlist(range_invlist, start, end);
8864 _invlist_union(invlist, range_invlist, &invlist);
8866 /* The temporary can be freed */
8867 SvREFCNT_dec_NN(range_invlist);
8873 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8874 UV** other_elements_ptr)
8876 /* Create and return an inversion list whose contents are to be populated
8877 * by the caller. The caller gives the number of elements (in 'size') and
8878 * the very first element ('element0'). This function will set
8879 * '*other_elements_ptr' to an array of UVs, where the remaining elements
8882 * Obviously there is some trust involved that the caller will properly
8883 * fill in the other elements of the array.
8885 * (The first element needs to be passed in, as the underlying code does
8886 * things differently depending on whether it is zero or non-zero) */
8888 SV* invlist = _new_invlist(size);
8891 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8893 _append_range_to_invlist(invlist, element0, element0);
8894 offset = *get_invlist_offset_addr(invlist);
8896 invlist_set_len(invlist, size, offset);
8897 *other_elements_ptr = invlist_array(invlist) + 1;
8903 PERL_STATIC_INLINE SV*
8904 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8905 return _add_range_to_invlist(invlist, cp, cp);
8908 #ifndef PERL_IN_XSUB_RE
8910 Perl__invlist_invert(pTHX_ SV* const invlist)
8912 /* Complement the input inversion list. This adds a 0 if the list didn't
8913 * have a zero; removes it otherwise. As described above, the data
8914 * structure is set up so that this is very efficient */
8916 PERL_ARGS_ASSERT__INVLIST_INVERT;
8918 assert(! invlist_is_iterating(invlist));
8920 /* The inverse of matching nothing is matching everything */
8921 if (_invlist_len(invlist) == 0) {
8922 _append_range_to_invlist(invlist, 0, UV_MAX);
8926 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8931 PERL_STATIC_INLINE SV*
8932 S_invlist_clone(pTHX_ SV* const invlist)
8935 /* Return a new inversion list that is a copy of the input one, which is
8936 * unchanged. The new list will not be mortal even if the old one was. */
8938 /* Need to allocate extra space to accommodate Perl's addition of a
8939 * trailing NUL to SvPV's, since it thinks they are always strings */
8940 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8941 STRLEN physical_length = SvCUR(invlist);
8942 bool offset = *(get_invlist_offset_addr(invlist));
8944 PERL_ARGS_ASSERT_INVLIST_CLONE;
8946 *(get_invlist_offset_addr(new_invlist)) = offset;
8947 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8948 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8953 PERL_STATIC_INLINE STRLEN*
8954 S_get_invlist_iter_addr(SV* invlist)
8956 /* Return the address of the UV that contains the current iteration
8959 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8961 assert(SvTYPE(invlist) == SVt_INVLIST);
8963 return &(((XINVLIST*) SvANY(invlist))->iterator);
8966 PERL_STATIC_INLINE void
8967 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
8969 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8971 *get_invlist_iter_addr(invlist) = 0;
8974 PERL_STATIC_INLINE void
8975 S_invlist_iterfinish(SV* invlist)
8977 /* Terminate iterator for invlist. This is to catch development errors.
8978 * Any iteration that is interrupted before completed should call this
8979 * function. Functions that add code points anywhere else but to the end
8980 * of an inversion list assert that they are not in the middle of an
8981 * iteration. If they were, the addition would make the iteration
8982 * problematical: if the iteration hadn't reached the place where things
8983 * were being added, it would be ok */
8985 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8987 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8991 S_invlist_iternext(SV* invlist, UV* start, UV* end)
8993 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8994 * This call sets in <*start> and <*end>, the next range in <invlist>.
8995 * Returns <TRUE> if successful and the next call will return the next
8996 * range; <FALSE> if was already at the end of the list. If the latter,
8997 * <*start> and <*end> are unchanged, and the next call to this function
8998 * will start over at the beginning of the list */
9000 STRLEN* pos = get_invlist_iter_addr(invlist);
9001 UV len = _invlist_len(invlist);
9004 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9007 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9011 array = invlist_array(invlist);
9013 *start = array[(*pos)++];
9019 *end = array[(*pos)++] - 1;
9025 PERL_STATIC_INLINE bool
9026 S_invlist_is_iterating(SV* const invlist)
9028 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9030 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9033 PERL_STATIC_INLINE UV
9034 S_invlist_highest(SV* const invlist)
9036 /* Returns the highest code point that matches an inversion list. This API
9037 * has an ambiguity, as it returns 0 under either the highest is actually
9038 * 0, or if the list is empty. If this distinction matters to you, check
9039 * for emptiness before calling this function */
9041 UV len = _invlist_len(invlist);
9044 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9050 array = invlist_array(invlist);
9052 /* The last element in the array in the inversion list always starts a
9053 * range that goes to infinity. That range may be for code points that are
9054 * matched in the inversion list, or it may be for ones that aren't
9055 * matched. In the latter case, the highest code point in the set is one
9056 * less than the beginning of this range; otherwise it is the final element
9057 * of this range: infinity */
9058 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9060 : array[len - 1] - 1;
9063 #ifndef PERL_IN_XSUB_RE
9065 Perl__invlist_contents(pTHX_ SV* const invlist)
9067 /* Get the contents of an inversion list into a string SV so that they can
9068 * be printed out. It uses the format traditionally done for debug tracing
9072 SV* output = newSVpvs("\n");
9074 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9076 assert(! invlist_is_iterating(invlist));
9078 invlist_iterinit(invlist);
9079 while (invlist_iternext(invlist, &start, &end)) {
9080 if (end == UV_MAX) {
9081 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9083 else if (end != start) {
9084 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9088 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9096 #ifndef PERL_IN_XSUB_RE
9098 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9099 const char * const indent, SV* const invlist)
9101 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9102 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9103 * the string 'indent'. The output looks like this:
9104 [0] 0x000A .. 0x000D
9106 [4] 0x2028 .. 0x2029
9107 [6] 0x3104 .. INFINITY
9108 * This means that the first range of code points matched by the list are
9109 * 0xA through 0xD; the second range contains only the single code point
9110 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9111 * are used to define each range (except if the final range extends to
9112 * infinity, only a single element is needed). The array index of the
9113 * first element for the corresponding range is given in brackets. */
9118 PERL_ARGS_ASSERT__INVLIST_DUMP;
9120 if (invlist_is_iterating(invlist)) {
9121 Perl_dump_indent(aTHX_ level, file,
9122 "%sCan't dump inversion list because is in middle of iterating\n",
9127 invlist_iterinit(invlist);
9128 while (invlist_iternext(invlist, &start, &end)) {
9129 if (end == UV_MAX) {
9130 Perl_dump_indent(aTHX_ level, file,
9131 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9132 indent, (UV)count, start);
9134 else if (end != start) {
9135 Perl_dump_indent(aTHX_ level, file,
9136 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9137 indent, (UV)count, start, end);
9140 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9141 indent, (UV)count, start);
9148 Perl__load_PL_utf8_foldclosures (pTHX)
9150 assert(! PL_utf8_foldclosures);
9152 /* If the folds haven't been read in, call a fold function
9154 if (! PL_utf8_tofold) {
9155 U8 dummy[UTF8_MAXBYTES_CASE+1];
9157 /* This string is just a short named one above \xff */
9158 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9159 assert(PL_utf8_tofold); /* Verify that worked */
9161 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9165 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9167 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9169 /* Return a boolean as to if the two passed in inversion lists are
9170 * identical. The final argument, if TRUE, says to take the complement of
9171 * the second inversion list before doing the comparison */
9173 const UV* array_a = invlist_array(a);
9174 const UV* array_b = invlist_array(b);
9175 UV len_a = _invlist_len(a);
9176 UV len_b = _invlist_len(b);
9178 UV i = 0; /* current index into the arrays */
9179 bool retval = TRUE; /* Assume are identical until proven otherwise */
9181 PERL_ARGS_ASSERT__INVLISTEQ;
9183 /* If are to compare 'a' with the complement of b, set it
9184 * up so are looking at b's complement. */
9187 /* The complement of nothing is everything, so <a> would have to have
9188 * just one element, starting at zero (ending at infinity) */
9190 return (len_a == 1 && array_a[0] == 0);
9192 else if (array_b[0] == 0) {
9194 /* Otherwise, to complement, we invert. Here, the first element is
9195 * 0, just remove it. To do this, we just pretend the array starts
9203 /* But if the first element is not zero, we pretend the list starts
9204 * at the 0 that is always stored immediately before the array. */
9210 /* Make sure that the lengths are the same, as well as the final element
9211 * before looping through the remainder. (Thus we test the length, final,
9212 * and first elements right off the bat) */
9213 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9216 else for (i = 0; i < len_a - 1; i++) {
9217 if (array_a[i] != array_b[i]) {
9227 #undef HEADER_LENGTH
9228 #undef TO_INTERNAL_SIZE
9229 #undef FROM_INTERNAL_SIZE
9230 #undef INVLIST_VERSION_ID
9232 /* End of inversion list object */
9235 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9237 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9238 * constructs, and updates RExC_flags with them. On input, RExC_parse
9239 * should point to the first flag; it is updated on output to point to the
9240 * final ')' or ':'. There needs to be at least one flag, or this will
9243 /* for (?g), (?gc), and (?o) warnings; warning
9244 about (?c) will warn about (?g) -- japhy */
9246 #define WASTED_O 0x01
9247 #define WASTED_G 0x02
9248 #define WASTED_C 0x04
9249 #define WASTED_GC (WASTED_G|WASTED_C)
9250 I32 wastedflags = 0x00;
9251 U32 posflags = 0, negflags = 0;
9252 U32 *flagsp = &posflags;
9253 char has_charset_modifier = '\0';
9255 bool has_use_defaults = FALSE;
9256 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9258 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9260 /* '^' as an initial flag sets certain defaults */
9261 if (UCHARAT(RExC_parse) == '^') {
9263 has_use_defaults = TRUE;
9264 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9265 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9266 ? REGEX_UNICODE_CHARSET
9267 : REGEX_DEPENDS_CHARSET);
9270 cs = get_regex_charset(RExC_flags);
9271 if (cs == REGEX_DEPENDS_CHARSET
9272 && (RExC_utf8 || RExC_uni_semantics))
9274 cs = REGEX_UNICODE_CHARSET;
9277 while (*RExC_parse) {
9278 /* && strchr("iogcmsx", *RExC_parse) */
9279 /* (?g), (?gc) and (?o) are useless here
9280 and must be globally applied -- japhy */
9281 switch (*RExC_parse) {
9283 /* Code for the imsx flags */
9284 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9286 case LOCALE_PAT_MOD:
9287 if (has_charset_modifier) {
9288 goto excess_modifier;
9290 else if (flagsp == &negflags) {
9293 cs = REGEX_LOCALE_CHARSET;
9294 has_charset_modifier = LOCALE_PAT_MOD;
9296 case UNICODE_PAT_MOD:
9297 if (has_charset_modifier) {
9298 goto excess_modifier;
9300 else if (flagsp == &negflags) {
9303 cs = REGEX_UNICODE_CHARSET;
9304 has_charset_modifier = UNICODE_PAT_MOD;
9306 case ASCII_RESTRICT_PAT_MOD:
9307 if (flagsp == &negflags) {
9310 if (has_charset_modifier) {
9311 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9312 goto excess_modifier;
9314 /* Doubled modifier implies more restricted */
9315 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9318 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9320 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9322 case DEPENDS_PAT_MOD:
9323 if (has_use_defaults) {
9324 goto fail_modifiers;
9326 else if (flagsp == &negflags) {
9329 else if (has_charset_modifier) {
9330 goto excess_modifier;
9333 /* The dual charset means unicode semantics if the
9334 * pattern (or target, not known until runtime) are
9335 * utf8, or something in the pattern indicates unicode
9337 cs = (RExC_utf8 || RExC_uni_semantics)
9338 ? REGEX_UNICODE_CHARSET
9339 : REGEX_DEPENDS_CHARSET;
9340 has_charset_modifier = DEPENDS_PAT_MOD;
9344 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9345 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9347 else if (has_charset_modifier == *(RExC_parse - 1)) {
9348 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9352 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9357 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9360 case ONCE_PAT_MOD: /* 'o' */
9361 case GLOBAL_PAT_MOD: /* 'g' */
9362 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9363 const I32 wflagbit = *RExC_parse == 'o'
9366 if (! (wastedflags & wflagbit) ) {
9367 wastedflags |= wflagbit;
9368 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9371 "Useless (%s%c) - %suse /%c modifier",
9372 flagsp == &negflags ? "?-" : "?",
9374 flagsp == &negflags ? "don't " : "",
9381 case CONTINUE_PAT_MOD: /* 'c' */
9382 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9383 if (! (wastedflags & WASTED_C) ) {
9384 wastedflags |= WASTED_GC;
9385 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9388 "Useless (%sc) - %suse /gc modifier",
9389 flagsp == &negflags ? "?-" : "?",
9390 flagsp == &negflags ? "don't " : ""
9395 case KEEPCOPY_PAT_MOD: /* 'p' */
9396 if (flagsp == &negflags) {
9398 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9400 *flagsp |= RXf_PMf_KEEPCOPY;
9404 /* A flag is a default iff it is following a minus, so
9405 * if there is a minus, it means will be trying to
9406 * re-specify a default which is an error */
9407 if (has_use_defaults || flagsp == &negflags) {
9408 goto fail_modifiers;
9411 wastedflags = 0; /* reset so (?g-c) warns twice */
9415 RExC_flags |= posflags;
9416 RExC_flags &= ~negflags;
9417 set_regex_charset(&RExC_flags, cs);
9418 if (RExC_flags & RXf_PMf_FOLD) {
9419 RExC_contains_i = 1;
9425 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9426 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9427 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9428 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9437 - reg - regular expression, i.e. main body or parenthesized thing
9439 * Caller must absorb opening parenthesis.
9441 * Combining parenthesis handling with the base level of regular expression
9442 * is a trifle forced, but the need to tie the tails of the branches to what
9443 * follows makes it hard to avoid.
9445 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9447 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9449 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9452 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9453 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9454 needs to be restarted.
9455 Otherwise would only return NULL if regbranch() returns NULL, which
9458 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9459 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9460 * 2 is like 1, but indicates that nextchar() has been called to advance
9461 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9462 * this flag alerts us to the need to check for that */
9465 regnode *ret; /* Will be the head of the group. */
9468 regnode *ender = NULL;
9471 U32 oregflags = RExC_flags;
9472 bool have_branch = 0;
9474 I32 freeze_paren = 0;
9475 I32 after_freeze = 0;
9476 I32 num; /* numeric backreferences */
9478 char * parse_start = RExC_parse; /* MJD */
9479 char * const oregcomp_parse = RExC_parse;
9481 GET_RE_DEBUG_FLAGS_DECL;
9483 PERL_ARGS_ASSERT_REG;
9484 DEBUG_PARSE("reg ");
9486 *flagp = 0; /* Tentatively. */
9489 /* Make an OPEN node, if parenthesized. */
9492 /* Under /x, space and comments can be gobbled up between the '(' and
9493 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9494 * intervening space, as the sequence is a token, and a token should be
9496 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9498 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9499 char *start_verb = RExC_parse;
9500 STRLEN verb_len = 0;
9501 char *start_arg = NULL;
9502 unsigned char op = 0;
9504 int internal_argval = 0; /* internal_argval is only useful if
9507 if (has_intervening_patws) {
9509 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9511 while ( *RExC_parse && *RExC_parse != ')' ) {
9512 if ( *RExC_parse == ':' ) {
9513 start_arg = RExC_parse + 1;
9519 verb_len = RExC_parse - start_verb;
9522 while ( *RExC_parse && *RExC_parse != ')' )
9524 if ( *RExC_parse != ')' )
9525 vFAIL("Unterminated verb pattern argument");
9526 if ( RExC_parse == start_arg )
9529 if ( *RExC_parse != ')' )
9530 vFAIL("Unterminated verb pattern");
9533 switch ( *start_verb ) {
9534 case 'A': /* (*ACCEPT) */
9535 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9537 internal_argval = RExC_nestroot;
9540 case 'C': /* (*COMMIT) */
9541 if ( memEQs(start_verb,verb_len,"COMMIT") )
9544 case 'F': /* (*FAIL) */
9545 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9550 case ':': /* (*:NAME) */
9551 case 'M': /* (*MARK:NAME) */
9552 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9557 case 'P': /* (*PRUNE) */
9558 if ( memEQs(start_verb,verb_len,"PRUNE") )
9561 case 'S': /* (*SKIP) */
9562 if ( memEQs(start_verb,verb_len,"SKIP") )
9565 case 'T': /* (*THEN) */
9566 /* [19:06] <TimToady> :: is then */
9567 if ( memEQs(start_verb,verb_len,"THEN") ) {
9569 RExC_seen |= REG_CUTGROUP_SEEN;
9574 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9576 "Unknown verb pattern '%"UTF8f"'",
9577 UTF8fARG(UTF, verb_len, start_verb));
9580 if ( start_arg && internal_argval ) {
9581 vFAIL3("Verb pattern '%.*s' may not have an argument",
9582 verb_len, start_verb);
9583 } else if ( argok < 0 && !start_arg ) {
9584 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9585 verb_len, start_verb);
9587 ret = reganode(pRExC_state, op, internal_argval);
9588 if ( ! internal_argval && ! SIZE_ONLY ) {
9590 SV *sv = newSVpvn( start_arg,
9591 RExC_parse - start_arg);
9592 ARG(ret) = add_data( pRExC_state,
9594 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9601 if (!internal_argval)
9602 RExC_seen |= REG_VERBARG_SEEN;
9603 } else if ( start_arg ) {
9604 vFAIL3("Verb pattern '%.*s' may not have an argument",
9605 verb_len, start_verb);
9607 ret = reg_node(pRExC_state, op);
9609 nextchar(pRExC_state);
9612 else if (*RExC_parse == '?') { /* (?...) */
9613 bool is_logical = 0;
9614 const char * const seqstart = RExC_parse;
9615 if (has_intervening_patws) {
9617 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9621 paren = *RExC_parse++;
9622 ret = NULL; /* For look-ahead/behind. */
9625 case 'P': /* (?P...) variants for those used to PCRE/Python */
9626 paren = *RExC_parse++;
9627 if ( paren == '<') /* (?P<...>) named capture */
9629 else if (paren == '>') { /* (?P>name) named recursion */
9630 goto named_recursion;
9632 else if (paren == '=') { /* (?P=...) named backref */
9633 /* this pretty much dupes the code for \k<NAME> in
9634 * regatom(), if you change this make sure you change that
9636 char* name_start = RExC_parse;
9638 SV *sv_dat = reg_scan_name(pRExC_state,
9639 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9640 if (RExC_parse == name_start || *RExC_parse != ')')
9641 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9642 vFAIL2("Sequence %.3s... not terminated",parse_start);
9645 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9646 RExC_rxi->data->data[num]=(void*)sv_dat;
9647 SvREFCNT_inc_simple_void(sv_dat);
9650 ret = reganode(pRExC_state,
9653 : (ASCII_FOLD_RESTRICTED)
9655 : (AT_LEAST_UNI_SEMANTICS)
9663 Set_Node_Offset(ret, parse_start+1);
9664 Set_Node_Cur_Length(ret, parse_start);
9666 nextchar(pRExC_state);
9670 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9671 vFAIL3("Sequence (%.*s...) not recognized",
9672 RExC_parse-seqstart, seqstart);
9674 case '<': /* (?<...) */
9675 if (*RExC_parse == '!')
9677 else if (*RExC_parse != '=')
9683 case '\'': /* (?'...') */
9684 name_start= RExC_parse;
9685 svname = reg_scan_name(pRExC_state,
9686 SIZE_ONLY /* reverse test from the others */
9687 ? REG_RSN_RETURN_NAME
9688 : REG_RSN_RETURN_NULL);
9689 if (RExC_parse == name_start || *RExC_parse != paren)
9690 vFAIL2("Sequence (?%c... not terminated",
9691 paren=='>' ? '<' : paren);
9695 if (!svname) /* shouldn't happen */
9697 "panic: reg_scan_name returned NULL");
9698 if (!RExC_paren_names) {
9699 RExC_paren_names= newHV();
9700 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9702 RExC_paren_name_list= newAV();
9703 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9706 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9708 sv_dat = HeVAL(he_str);
9710 /* croak baby croak */
9712 "panic: paren_name hash element allocation failed");
9713 } else if ( SvPOK(sv_dat) ) {
9714 /* (?|...) can mean we have dupes so scan to check
9715 its already been stored. Maybe a flag indicating
9716 we are inside such a construct would be useful,
9717 but the arrays are likely to be quite small, so
9718 for now we punt -- dmq */
9719 IV count = SvIV(sv_dat);
9720 I32 *pv = (I32*)SvPVX(sv_dat);
9722 for ( i = 0 ; i < count ; i++ ) {
9723 if ( pv[i] == RExC_npar ) {
9729 pv = (I32*)SvGROW(sv_dat,
9730 SvCUR(sv_dat) + sizeof(I32)+1);
9731 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9732 pv[count] = RExC_npar;
9733 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9736 (void)SvUPGRADE(sv_dat,SVt_PVNV);
9737 sv_setpvn(sv_dat, (char *)&(RExC_npar),
9740 SvIV_set(sv_dat, 1);
9743 /* Yes this does cause a memory leak in debugging Perls
9745 if (!av_store(RExC_paren_name_list,
9746 RExC_npar, SvREFCNT_inc(svname)))
9747 SvREFCNT_dec_NN(svname);
9750 /*sv_dump(sv_dat);*/
9752 nextchar(pRExC_state);
9754 goto capturing_parens;
9756 RExC_seen |= REG_LOOKBEHIND_SEEN;
9757 RExC_in_lookbehind++;
9760 case '=': /* (?=...) */
9761 RExC_seen_zerolen++;
9763 case '!': /* (?!...) */
9764 RExC_seen_zerolen++;
9765 if (*RExC_parse == ')') {
9766 ret=reg_node(pRExC_state, OPFAIL);
9767 nextchar(pRExC_state);
9771 case '|': /* (?|...) */
9772 /* branch reset, behave like a (?:...) except that
9773 buffers in alternations share the same numbers */
9775 after_freeze = freeze_paren = RExC_npar;
9777 case ':': /* (?:...) */
9778 case '>': /* (?>...) */
9780 case '$': /* (?$...) */
9781 case '@': /* (?@...) */
9782 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9784 case '0' : /* (?0) */
9785 case 'R' : /* (?R) */
9786 if (*RExC_parse != ')')
9787 FAIL("Sequence (?R) not terminated");
9788 ret = reg_node(pRExC_state, GOSTART);
9789 RExC_seen |= REG_GOSTART_SEEN;
9790 *flagp |= POSTPONED;
9791 nextchar(pRExC_state);
9794 /* named and numeric backreferences */
9795 case '&': /* (?&NAME) */
9796 parse_start = RExC_parse - 1;
9799 SV *sv_dat = reg_scan_name(pRExC_state,
9800 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9801 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9803 if (RExC_parse == RExC_end || *RExC_parse != ')')
9804 vFAIL("Sequence (?&... not terminated");
9805 goto gen_recurse_regop;
9806 assert(0); /* NOT REACHED */
9808 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9810 vFAIL("Illegal pattern");
9812 goto parse_recursion;
9814 case '-': /* (?-1) */
9815 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9816 RExC_parse--; /* rewind to let it be handled later */
9820 case '1': case '2': case '3': case '4': /* (?1) */
9821 case '5': case '6': case '7': case '8': case '9':
9824 num = atoi(RExC_parse);
9825 parse_start = RExC_parse - 1; /* MJD */
9826 if (*RExC_parse == '-')
9828 while (isDIGIT(*RExC_parse))
9830 if (*RExC_parse!=')')
9831 vFAIL("Expecting close bracket");
9834 if ( paren == '-' ) {
9836 Diagram of capture buffer numbering.
9837 Top line is the normal capture buffer numbers
9838 Bottom line is the negative indexing as from
9842 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9846 num = RExC_npar + num;
9849 vFAIL("Reference to nonexistent group");
9851 } else if ( paren == '+' ) {
9852 num = RExC_npar + num - 1;
9855 ret = reganode(pRExC_state, GOSUB, num);
9857 if (num > (I32)RExC_rx->nparens) {
9859 vFAIL("Reference to nonexistent group");
9861 ARG2L_SET( ret, RExC_recurse_count++);
9863 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9864 "Recurse #%"UVuf" to %"IVdf"\n",
9865 (UV)ARG(ret), (IV)ARG2L(ret)));
9869 RExC_seen |= REG_RECURSE_SEEN;
9870 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9871 Set_Node_Offset(ret, parse_start); /* MJD */
9873 *flagp |= POSTPONED;
9874 nextchar(pRExC_state);
9877 assert(0); /* NOT REACHED */
9879 case '?': /* (??...) */
9881 if (*RExC_parse != '{') {
9883 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9885 "Sequence (%"UTF8f"...) not recognized",
9886 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9889 *flagp |= POSTPONED;
9890 paren = *RExC_parse++;
9892 case '{': /* (?{...}) */
9895 struct reg_code_block *cb;
9897 RExC_seen_zerolen++;
9899 if ( !pRExC_state->num_code_blocks
9900 || pRExC_state->code_index >= pRExC_state->num_code_blocks
9901 || pRExC_state->code_blocks[pRExC_state->code_index].start
9902 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9905 if (RExC_pm_flags & PMf_USE_RE_EVAL)
9906 FAIL("panic: Sequence (?{...}): no code block found\n");
9907 FAIL("Eval-group not allowed at runtime, use re 'eval'");
9909 /* this is a pre-compiled code block (?{...}) */
9910 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9911 RExC_parse = RExC_start + cb->end;
9914 if (cb->src_regex) {
9915 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9916 RExC_rxi->data->data[n] =
9917 (void*)SvREFCNT_inc((SV*)cb->src_regex);
9918 RExC_rxi->data->data[n+1] = (void*)o;
9921 n = add_data(pRExC_state,
9922 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9923 RExC_rxi->data->data[n] = (void*)o;
9926 pRExC_state->code_index++;
9927 nextchar(pRExC_state);
9931 ret = reg_node(pRExC_state, LOGICAL);
9932 eval = reganode(pRExC_state, EVAL, n);
9935 /* for later propagation into (??{}) return value */
9936 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9938 REGTAIL(pRExC_state, ret, eval);
9939 /* deal with the length of this later - MJD */
9942 ret = reganode(pRExC_state, EVAL, n);
9943 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9944 Set_Node_Offset(ret, parse_start);
9947 case '(': /* (?(?{...})...) and (?(?=...)...) */
9950 if (RExC_parse[0] == '?') { /* (?(?...)) */
9951 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9952 || RExC_parse[1] == '<'
9953 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9957 ret = reg_node(pRExC_state, LOGICAL);
9961 tail = reg(pRExC_state, 1, &flag, depth+1);
9962 if (flag & RESTART_UTF8) {
9963 *flagp = RESTART_UTF8;
9966 REGTAIL(pRExC_state, ret, tail);
9970 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9971 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9973 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9974 char *name_start= RExC_parse++;
9976 SV *sv_dat=reg_scan_name(pRExC_state,
9977 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9978 if (RExC_parse == name_start || *RExC_parse != ch)
9979 vFAIL2("Sequence (?(%c... not terminated",
9980 (ch == '>' ? '<' : ch));
9983 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9984 RExC_rxi->data->data[num]=(void*)sv_dat;
9985 SvREFCNT_inc_simple_void(sv_dat);
9987 ret = reganode(pRExC_state,NGROUPP,num);
9988 goto insert_if_check_paren;
9990 else if (RExC_parse[0] == 'D' &&
9991 RExC_parse[1] == 'E' &&
9992 RExC_parse[2] == 'F' &&
9993 RExC_parse[3] == 'I' &&
9994 RExC_parse[4] == 'N' &&
9995 RExC_parse[5] == 'E')
9997 ret = reganode(pRExC_state,DEFINEP,0);
10000 goto insert_if_check_paren;
10002 else if (RExC_parse[0] == 'R') {
10005 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10006 parno = atoi(RExC_parse++);
10007 while (isDIGIT(*RExC_parse))
10009 } else if (RExC_parse[0] == '&') {
10012 sv_dat = reg_scan_name(pRExC_state,
10014 ? REG_RSN_RETURN_NULL
10015 : REG_RSN_RETURN_DATA);
10016 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10018 ret = reganode(pRExC_state,INSUBP,parno);
10019 goto insert_if_check_paren;
10021 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10025 parno = atoi(RExC_parse++);
10027 while (isDIGIT(*RExC_parse))
10029 ret = reganode(pRExC_state, GROUPP, parno);
10031 insert_if_check_paren:
10032 if (*(tmp = nextchar(pRExC_state)) != ')') {
10033 /* nextchar also skips comments, so undo its work
10034 * and skip over the the next character.
10037 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10038 vFAIL("Switch condition not recognized");
10041 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10042 br = regbranch(pRExC_state, &flags, 1,depth+1);
10044 if (flags & RESTART_UTF8) {
10045 *flagp = RESTART_UTF8;
10048 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10051 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10053 c = *nextchar(pRExC_state);
10054 if (flags&HASWIDTH)
10055 *flagp |= HASWIDTH;
10058 vFAIL("(?(DEFINE)....) does not allow branches");
10060 /* Fake one for optimizer. */
10061 lastbr = reganode(pRExC_state, IFTHEN, 0);
10063 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10064 if (flags & RESTART_UTF8) {
10065 *flagp = RESTART_UTF8;
10068 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10071 REGTAIL(pRExC_state, ret, lastbr);
10072 if (flags&HASWIDTH)
10073 *flagp |= HASWIDTH;
10074 c = *nextchar(pRExC_state);
10079 vFAIL("Switch (?(condition)... contains too many branches");
10080 ender = reg_node(pRExC_state, TAIL);
10081 REGTAIL(pRExC_state, br, ender);
10083 REGTAIL(pRExC_state, lastbr, ender);
10084 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10087 REGTAIL(pRExC_state, ret, ender);
10088 RExC_size++; /* XXX WHY do we need this?!!
10089 For large programs it seems to be required
10090 but I can't figure out why. -- dmq*/
10094 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10095 vFAIL("Unknown switch condition (?(...))");
10098 case '[': /* (?[ ... ]) */
10099 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10102 RExC_parse--; /* for vFAIL to print correctly */
10103 vFAIL("Sequence (? incomplete");
10105 default: /* e.g., (?i) */
10108 parse_lparen_question_flags(pRExC_state);
10109 if (UCHARAT(RExC_parse) != ':') {
10110 nextchar(pRExC_state);
10115 nextchar(pRExC_state);
10125 ret = reganode(pRExC_state, OPEN, parno);
10127 if (!RExC_nestroot)
10128 RExC_nestroot = parno;
10129 if (RExC_seen & REG_RECURSE_SEEN
10130 && !RExC_open_parens[parno-1])
10132 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10133 "Setting open paren #%"IVdf" to %d\n",
10134 (IV)parno, REG_NODE_NUM(ret)));
10135 RExC_open_parens[parno-1]= ret;
10138 Set_Node_Length(ret, 1); /* MJD */
10139 Set_Node_Offset(ret, RExC_parse); /* MJD */
10147 /* Pick up the branches, linking them together. */
10148 parse_start = RExC_parse; /* MJD */
10149 br = regbranch(pRExC_state, &flags, 1,depth+1);
10151 /* branch_len = (paren != 0); */
10154 if (flags & RESTART_UTF8) {
10155 *flagp = RESTART_UTF8;
10158 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10160 if (*RExC_parse == '|') {
10161 if (!SIZE_ONLY && RExC_extralen) {
10162 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10165 reginsert(pRExC_state, BRANCH, br, depth+1);
10166 Set_Node_Length(br, paren != 0);
10167 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10171 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10173 else if (paren == ':') {
10174 *flagp |= flags&SIMPLE;
10176 if (is_open) { /* Starts with OPEN. */
10177 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10179 else if (paren != '?') /* Not Conditional */
10181 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10183 while (*RExC_parse == '|') {
10184 if (!SIZE_ONLY && RExC_extralen) {
10185 ender = reganode(pRExC_state, LONGJMP,0);
10187 /* Append to the previous. */
10188 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10191 RExC_extralen += 2; /* Account for LONGJMP. */
10192 nextchar(pRExC_state);
10193 if (freeze_paren) {
10194 if (RExC_npar > after_freeze)
10195 after_freeze = RExC_npar;
10196 RExC_npar = freeze_paren;
10198 br = regbranch(pRExC_state, &flags, 0, depth+1);
10201 if (flags & RESTART_UTF8) {
10202 *flagp = RESTART_UTF8;
10205 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10207 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10209 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10212 if (have_branch || paren != ':') {
10213 /* Make a closing node, and hook it on the end. */
10216 ender = reg_node(pRExC_state, TAIL);
10219 ender = reganode(pRExC_state, CLOSE, parno);
10220 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10221 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10222 "Setting close paren #%"IVdf" to %d\n",
10223 (IV)parno, REG_NODE_NUM(ender)));
10224 RExC_close_parens[parno-1]= ender;
10225 if (RExC_nestroot == parno)
10228 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10229 Set_Node_Length(ender,1); /* MJD */
10235 *flagp &= ~HASWIDTH;
10238 ender = reg_node(pRExC_state, SUCCEED);
10241 ender = reg_node(pRExC_state, END);
10243 assert(!RExC_opend); /* there can only be one! */
10244 RExC_opend = ender;
10248 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10249 SV * const mysv_val1=sv_newmortal();
10250 SV * const mysv_val2=sv_newmortal();
10251 DEBUG_PARSE_MSG("lsbr");
10252 regprop(RExC_rx, mysv_val1, lastbr, NULL);
10253 regprop(RExC_rx, mysv_val2, ender, NULL);
10254 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10255 SvPV_nolen_const(mysv_val1),
10256 (IV)REG_NODE_NUM(lastbr),
10257 SvPV_nolen_const(mysv_val2),
10258 (IV)REG_NODE_NUM(ender),
10259 (IV)(ender - lastbr)
10262 REGTAIL(pRExC_state, lastbr, ender);
10264 if (have_branch && !SIZE_ONLY) {
10265 char is_nothing= 1;
10267 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10269 /* Hook the tails of the branches to the closing node. */
10270 for (br = ret; br; br = regnext(br)) {
10271 const U8 op = PL_regkind[OP(br)];
10272 if (op == BRANCH) {
10273 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10274 if ( OP(NEXTOPER(br)) != NOTHING
10275 || regnext(NEXTOPER(br)) != ender)
10278 else if (op == BRANCHJ) {
10279 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10280 /* for now we always disable this optimisation * /
10281 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10282 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10288 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10289 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10290 SV * const mysv_val1=sv_newmortal();
10291 SV * const mysv_val2=sv_newmortal();
10292 DEBUG_PARSE_MSG("NADA");
10293 regprop(RExC_rx, mysv_val1, ret, NULL);
10294 regprop(RExC_rx, mysv_val2, ender, NULL);
10295 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10296 SvPV_nolen_const(mysv_val1),
10297 (IV)REG_NODE_NUM(ret),
10298 SvPV_nolen_const(mysv_val2),
10299 (IV)REG_NODE_NUM(ender),
10304 if (OP(ender) == TAIL) {
10309 for ( opt= br + 1; opt < ender ; opt++ )
10310 OP(opt)= OPTIMIZED;
10311 NEXT_OFF(br)= ender - br;
10319 static const char parens[] = "=!<,>";
10321 if (paren && (p = strchr(parens, paren))) {
10322 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10323 int flag = (p - parens) > 1;
10326 node = SUSPEND, flag = 0;
10327 reginsert(pRExC_state, node,ret, depth+1);
10328 Set_Node_Cur_Length(ret, parse_start);
10329 Set_Node_Offset(ret, parse_start + 1);
10331 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10335 /* Check for proper termination. */
10337 /* restore original flags, but keep (?p) */
10338 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10339 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10340 RExC_parse = oregcomp_parse;
10341 vFAIL("Unmatched (");
10344 else if (!paren && RExC_parse < RExC_end) {
10345 if (*RExC_parse == ')') {
10347 vFAIL("Unmatched )");
10350 FAIL("Junk on end of regexp"); /* "Can't happen". */
10351 assert(0); /* NOTREACHED */
10354 if (RExC_in_lookbehind) {
10355 RExC_in_lookbehind--;
10357 if (after_freeze > RExC_npar)
10358 RExC_npar = after_freeze;
10363 - regbranch - one alternative of an | operator
10365 * Implements the concatenation operator.
10367 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10371 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10375 regnode *chain = NULL;
10377 I32 flags = 0, c = 0;
10378 GET_RE_DEBUG_FLAGS_DECL;
10380 PERL_ARGS_ASSERT_REGBRANCH;
10382 DEBUG_PARSE("brnc");
10387 if (!SIZE_ONLY && RExC_extralen)
10388 ret = reganode(pRExC_state, BRANCHJ,0);
10390 ret = reg_node(pRExC_state, BRANCH);
10391 Set_Node_Length(ret, 1);
10395 if (!first && SIZE_ONLY)
10396 RExC_extralen += 1; /* BRANCHJ */
10398 *flagp = WORST; /* Tentatively. */
10401 nextchar(pRExC_state);
10402 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10403 flags &= ~TRYAGAIN;
10404 latest = regpiece(pRExC_state, &flags,depth+1);
10405 if (latest == NULL) {
10406 if (flags & TRYAGAIN)
10408 if (flags & RESTART_UTF8) {
10409 *flagp = RESTART_UTF8;
10412 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10414 else if (ret == NULL)
10416 *flagp |= flags&(HASWIDTH|POSTPONED);
10417 if (chain == NULL) /* First piece. */
10418 *flagp |= flags&SPSTART;
10421 REGTAIL(pRExC_state, chain, latest);
10426 if (chain == NULL) { /* Loop ran zero times. */
10427 chain = reg_node(pRExC_state, NOTHING);
10432 *flagp |= flags&SIMPLE;
10439 - regpiece - something followed by possible [*+?]
10441 * Note that the branching code sequences used for ? and the general cases
10442 * of * and + are somewhat optimized: they use the same NOTHING node as
10443 * both the endmarker for their branch list and the body of the last branch.
10444 * It might seem that this node could be dispensed with entirely, but the
10445 * endmarker role is not redundant.
10447 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10449 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10453 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10460 const char * const origparse = RExC_parse;
10462 I32 max = REG_INFTY;
10463 #ifdef RE_TRACK_PATTERN_OFFSETS
10466 const char *maxpos = NULL;
10468 /* Save the original in case we change the emitted regop to a FAIL. */
10469 regnode * const orig_emit = RExC_emit;
10471 GET_RE_DEBUG_FLAGS_DECL;
10473 PERL_ARGS_ASSERT_REGPIECE;
10475 DEBUG_PARSE("piec");
10477 ret = regatom(pRExC_state, &flags,depth+1);
10479 if (flags & (TRYAGAIN|RESTART_UTF8))
10480 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10482 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10488 if (op == '{' && regcurly(RExC_parse)) {
10490 #ifdef RE_TRACK_PATTERN_OFFSETS
10491 parse_start = RExC_parse; /* MJD */
10493 next = RExC_parse + 1;
10494 while (isDIGIT(*next) || *next == ',') {
10495 if (*next == ',') {
10503 if (*next == '}') { /* got one */
10507 min = atoi(RExC_parse);
10508 if (*maxpos == ',')
10511 maxpos = RExC_parse;
10512 max = atoi(maxpos);
10513 if (!max && *maxpos != '0')
10514 max = REG_INFTY; /* meaning "infinity" */
10515 else if (max >= REG_INFTY)
10516 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10518 nextchar(pRExC_state);
10519 if (max < min) { /* If can't match, warn and optimize to fail
10522 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10524 /* We can't back off the size because we have to reserve
10525 * enough space for all the things we are about to throw
10526 * away, but we can shrink it by the ammount we are about
10527 * to re-use here */
10528 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10531 RExC_emit = orig_emit;
10533 ret = reg_node(pRExC_state, OPFAIL);
10536 else if (min == max
10537 && RExC_parse < RExC_end
10538 && (*RExC_parse == '?' || *RExC_parse == '+'))
10541 ckWARN2reg(RExC_parse + 1,
10542 "Useless use of greediness modifier '%c'",
10545 /* Absorb the modifier, so later code doesn't see nor use
10547 nextchar(pRExC_state);
10551 if ((flags&SIMPLE)) {
10552 RExC_naughty += 2 + RExC_naughty / 2;
10553 reginsert(pRExC_state, CURLY, ret, depth+1);
10554 Set_Node_Offset(ret, parse_start+1); /* MJD */
10555 Set_Node_Cur_Length(ret, parse_start);
10558 regnode * const w = reg_node(pRExC_state, WHILEM);
10561 REGTAIL(pRExC_state, ret, w);
10562 if (!SIZE_ONLY && RExC_extralen) {
10563 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10564 reginsert(pRExC_state, NOTHING,ret, depth+1);
10565 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10567 reginsert(pRExC_state, CURLYX,ret, depth+1);
10569 Set_Node_Offset(ret, parse_start+1);
10570 Set_Node_Length(ret,
10571 op == '{' ? (RExC_parse - parse_start) : 1);
10573 if (!SIZE_ONLY && RExC_extralen)
10574 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10575 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10577 RExC_whilem_seen++, RExC_extralen += 3;
10578 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10585 *flagp |= HASWIDTH;
10587 ARG1_SET(ret, (U16)min);
10588 ARG2_SET(ret, (U16)max);
10590 if (max == REG_INFTY)
10591 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10597 if (!ISMULT1(op)) {
10602 #if 0 /* Now runtime fix should be reliable. */
10604 /* if this is reinstated, don't forget to put this back into perldiag:
10606 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10608 (F) The part of the regexp subject to either the * or + quantifier
10609 could match an empty string. The {#} shows in the regular
10610 expression about where the problem was discovered.
10614 if (!(flags&HASWIDTH) && op != '?')
10615 vFAIL("Regexp *+ operand could be empty");
10618 #ifdef RE_TRACK_PATTERN_OFFSETS
10619 parse_start = RExC_parse;
10621 nextchar(pRExC_state);
10623 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10625 if (op == '*' && (flags&SIMPLE)) {
10626 reginsert(pRExC_state, STAR, ret, depth+1);
10629 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10631 else if (op == '*') {
10635 else if (op == '+' && (flags&SIMPLE)) {
10636 reginsert(pRExC_state, PLUS, ret, depth+1);
10639 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10641 else if (op == '+') {
10645 else if (op == '?') {
10650 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10651 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10652 ckWARN2reg(RExC_parse,
10653 "%"UTF8f" matches null string many times",
10654 UTF8fARG(UTF, (RExC_parse >= origparse
10655 ? RExC_parse - origparse
10658 (void)ReREFCNT_inc(RExC_rx_sv);
10661 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10662 nextchar(pRExC_state);
10663 reginsert(pRExC_state, MINMOD, ret, depth+1);
10664 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10667 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10669 nextchar(pRExC_state);
10670 ender = reg_node(pRExC_state, SUCCEED);
10671 REGTAIL(pRExC_state, ret, ender);
10672 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10674 ender = reg_node(pRExC_state, TAIL);
10675 REGTAIL(pRExC_state, ret, ender);
10678 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10680 vFAIL("Nested quantifiers");
10687 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10688 UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10689 const bool strict /* Apply stricter parsing rules? */
10693 /* This is expected to be called by a parser routine that has recognized '\N'
10694 and needs to handle the rest. RExC_parse is expected to point at the first
10695 char following the N at the time of the call. On successful return,
10696 RExC_parse has been updated to point to just after the sequence identified
10697 by this routine, and <*flagp> has been updated.
10699 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10702 \N may begin either a named sequence, or if outside a character class, mean
10703 to match a non-newline. For non single-quoted regexes, the tokenizer has
10704 attempted to decide which, and in the case of a named sequence, converted it
10705 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10706 where c1... are the characters in the sequence. For single-quoted regexes,
10707 the tokenizer passes the \N sequence through unchanged; this code will not
10708 attempt to determine this nor expand those, instead raising a syntax error.
10709 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10710 or there is no '}', it signals that this \N occurrence means to match a
10713 Only the \N{U+...} form should occur in a character class, for the same
10714 reason that '.' inside a character class means to just match a period: it
10715 just doesn't make sense.
10717 The function raises an error (via vFAIL), and doesn't return for various
10718 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
10719 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10720 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10721 only possible if node_p is non-NULL.
10724 If <valuep> is non-null, it means the caller can accept an input sequence
10725 consisting of a just a single code point; <*valuep> is set to that value
10726 if the input is such.
10728 If <node_p> is non-null it signifies that the caller can accept any other
10729 legal sequence (i.e., one that isn't just a single code point). <*node_p>
10731 1) \N means not-a-NL: points to a newly created REG_ANY node;
10732 2) \N{}: points to a new NOTHING node;
10733 3) otherwise: points to a new EXACT node containing the resolved
10735 Note that FALSE is returned for single code point sequences if <valuep> is
10739 char * endbrace; /* '}' following the name */
10741 char *endchar; /* Points to '.' or '}' ending cur char in the input
10743 bool has_multiple_chars; /* true if the input stream contains a sequence of
10744 more than one character */
10746 GET_RE_DEBUG_FLAGS_DECL;
10748 PERL_ARGS_ASSERT_GROK_BSLASH_N;
10750 GET_RE_DEBUG_FLAGS;
10752 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
10754 /* The [^\n] meaning of \N ignores spaces and comments under the /x
10755 * modifier. The other meaning does not, so use a temporary until we find
10756 * out which we are being called with */
10757 p = (RExC_flags & RXf_PMf_EXTENDED)
10758 ? regpatws(pRExC_state, RExC_parse,
10759 TRUE) /* means recognize comments */
10762 /* Disambiguate between \N meaning a named character versus \N meaning
10763 * [^\n]. The former is assumed when it can't be the latter. */
10764 if (*p != '{' || regcurly(p)) {
10767 /* no bare \N allowed in a charclass */
10768 if (in_char_class) {
10769 vFAIL("\\N in a character class must be a named character: \\N{...}");
10773 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
10775 nextchar(pRExC_state);
10776 *node_p = reg_node(pRExC_state, REG_ANY);
10777 *flagp |= HASWIDTH|SIMPLE;
10779 Set_Node_Length(*node_p, 1); /* MJD */
10783 /* Here, we have decided it should be a named character or sequence */
10785 /* The test above made sure that the next real character is a '{', but
10786 * under the /x modifier, it could be separated by space (or a comment and
10787 * \n) and this is not allowed (for consistency with \x{...} and the
10788 * tokenizer handling of \N{NAME}). */
10789 if (*RExC_parse != '{') {
10790 vFAIL("Missing braces on \\N{}");
10793 RExC_parse++; /* Skip past the '{' */
10795 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10796 || ! (endbrace == RExC_parse /* nothing between the {} */
10797 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
10799 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10802 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10803 vFAIL("\\N{NAME} must be resolved by the lexer");
10806 if (endbrace == RExC_parse) { /* empty: \N{} */
10809 *node_p = reg_node(pRExC_state,NOTHING);
10811 else if (in_char_class) {
10812 if (SIZE_ONLY && in_char_class) {
10814 RExC_parse++; /* Position after the "}" */
10815 vFAIL("Zero length \\N{}");
10818 ckWARNreg(RExC_parse,
10819 "Ignoring zero length \\N{} in character class");
10827 nextchar(pRExC_state);
10831 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10832 RExC_parse += 2; /* Skip past the 'U+' */
10834 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10836 /* Code points are separated by dots. If none, there is only one code
10837 * point, and is terminated by the brace */
10838 has_multiple_chars = (endchar < endbrace);
10840 if (valuep && (! has_multiple_chars || in_char_class)) {
10841 /* We only pay attention to the first char of
10842 multichar strings being returned in char classes. I kinda wonder
10843 if this makes sense as it does change the behaviour
10844 from earlier versions, OTOH that behaviour was broken
10845 as well. XXX Solution is to recharacterize as
10846 [rest-of-class]|multi1|multi2... */
10848 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10849 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10850 | PERL_SCAN_DISALLOW_PREFIX
10851 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10853 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10855 /* The tokenizer should have guaranteed validity, but it's possible to
10856 * bypass it by using single quoting, so check */
10857 if (length_of_hex == 0
10858 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10860 RExC_parse += length_of_hex; /* Includes all the valid */
10861 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10862 ? UTF8SKIP(RExC_parse)
10864 /* Guard against malformed utf8 */
10865 if (RExC_parse >= endchar) {
10866 RExC_parse = endchar;
10868 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10871 if (in_char_class && has_multiple_chars) {
10873 RExC_parse = endbrace;
10874 vFAIL("\\N{} in character class restricted to one character");
10877 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10881 RExC_parse = endbrace + 1;
10883 else if (! node_p || ! has_multiple_chars) {
10885 /* Here, the input is legal, but not according to the caller's
10886 * options. We fail without advancing the parse, so that the
10887 * caller can try again */
10893 /* What is done here is to convert this to a sub-pattern of the form
10894 * (?:\x{char1}\x{char2}...)
10895 * and then call reg recursively. That way, it retains its atomicness,
10896 * while not having to worry about special handling that some code
10897 * points may have. toke.c has converted the original Unicode values
10898 * to native, so that we can just pass on the hex values unchanged. We
10899 * do have to set a flag to keep recoding from happening in the
10902 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10904 char *orig_end = RExC_end;
10907 while (RExC_parse < endbrace) {
10909 /* Convert to notation the rest of the code understands */
10910 sv_catpv(substitute_parse, "\\x{");
10911 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10912 sv_catpv(substitute_parse, "}");
10914 /* Point to the beginning of the next character in the sequence. */
10915 RExC_parse = endchar + 1;
10916 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10918 sv_catpv(substitute_parse, ")");
10920 RExC_parse = SvPV(substitute_parse, len);
10922 /* Don't allow empty number */
10924 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10926 RExC_end = RExC_parse + len;
10928 /* The values are Unicode, and therefore not subject to recoding */
10929 RExC_override_recoding = 1;
10931 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10932 if (flags & RESTART_UTF8) {
10933 *flagp = RESTART_UTF8;
10936 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10939 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10941 RExC_parse = endbrace;
10942 RExC_end = orig_end;
10943 RExC_override_recoding = 0;
10945 nextchar(pRExC_state);
10955 * It returns the code point in utf8 for the value in *encp.
10956 * value: a code value in the source encoding
10957 * encp: a pointer to an Encode object
10959 * If the result from Encode is not a single character,
10960 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10963 S_reg_recode(pTHX_ const char value, SV **encp)
10966 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10967 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10968 const STRLEN newlen = SvCUR(sv);
10969 UV uv = UNICODE_REPLACEMENT;
10971 PERL_ARGS_ASSERT_REG_RECODE;
10975 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10978 if (!newlen || numlen != newlen) {
10979 uv = UNICODE_REPLACEMENT;
10985 PERL_STATIC_INLINE U8
10986 S_compute_EXACTish(RExC_state_t *pRExC_state)
10990 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10996 op = get_regex_charset(RExC_flags);
10997 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
10998 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
10999 been, so there is no hole */
11002 return op + EXACTF;
11005 PERL_STATIC_INLINE void
11006 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11007 regnode *node, I32* flagp, STRLEN len, UV code_point,
11010 /* This knows the details about sizing an EXACTish node, setting flags for
11011 * it (by setting <*flagp>, and potentially populating it with a single
11014 * If <len> (the length in bytes) is non-zero, this function assumes that
11015 * the node has already been populated, and just does the sizing. In this
11016 * case <code_point> should be the final code point that has already been
11017 * placed into the node. This value will be ignored except that under some
11018 * circumstances <*flagp> is set based on it.
11020 * If <len> is zero, the function assumes that the node is to contain only
11021 * the single character given by <code_point> and calculates what <len>
11022 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11023 * additionally will populate the node's STRING with <code_point> or its
11026 * In both cases <*flagp> is appropriately set
11028 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11029 * 255, must be folded (the former only when the rules indicate it can
11032 * When it does the populating, it looks at the flag 'downgradable'. If
11033 * true with a node that folds, it checks if the single code point
11034 * participates in a fold, and if not downgrades the node to an EXACT.
11035 * This helps the optimizer */
11037 bool len_passed_in = cBOOL(len != 0);
11038 U8 character[UTF8_MAXBYTES_CASE+1];
11040 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11042 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11043 * sizing difference, and is extra work that is thrown away */
11044 if (downgradable && ! PASS2) {
11045 downgradable = FALSE;
11048 if (! len_passed_in) {
11050 if (UNI_IS_INVARIANT(code_point)) {
11051 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11052 *character = (U8) code_point;
11054 else { /* Here is /i and not /l (toFOLD() is defined on just
11055 ASCII, which isn't the same thing as INVARIANT on
11056 EBCDIC, but it works there, as the extra invariants
11057 fold to themselves) */
11058 *character = toFOLD((U8) code_point);
11060 && *character == code_point
11061 && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
11068 else if (FOLD && (! LOC
11069 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11070 { /* Folding, and ok to do so now */
11071 UV folded = _to_uni_fold_flags(
11075 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11076 ? FOLD_FLAGS_NOMIX_ASCII
11079 && folded == code_point
11080 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11085 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11087 /* Not folding this cp, and can output it directly */
11088 *character = UTF8_TWO_BYTE_HI(code_point);
11089 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11093 uvchr_to_utf8( character, code_point);
11094 len = UTF8SKIP(character);
11096 } /* Else pattern isn't UTF8. */
11098 *character = (U8) code_point;
11100 } /* Else is folded non-UTF8 */
11101 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11103 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11104 * comments at join_exact()); */
11105 *character = (U8) code_point;
11108 /* Can turn into an EXACT node if we know the fold at compile time,
11109 * and it folds to itself and doesn't particpate in other folds */
11112 && PL_fold_latin1[code_point] == code_point
11113 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11114 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11118 } /* else is Sharp s. May need to fold it */
11119 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11121 *(character + 1) = 's';
11125 *character = LATIN_SMALL_LETTER_SHARP_S;
11131 RExC_size += STR_SZ(len);
11134 RExC_emit += STR_SZ(len);
11135 STR_LEN(node) = len;
11136 if (! len_passed_in) {
11137 Copy((char *) character, STRING(node), len, char);
11141 *flagp |= HASWIDTH;
11143 /* A single character node is SIMPLE, except for the special-cased SHARP S
11145 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11146 && (code_point != LATIN_SMALL_LETTER_SHARP_S
11147 || ! FOLD || ! DEPENDS_SEMANTICS))
11152 /* The OP may not be well defined in PASS1 */
11153 if (PASS2 && OP(node) == EXACTFL) {
11154 RExC_contains_locale = 1;
11159 /* return atoi(p), unless it's too big to sensibly be a backref,
11160 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11163 S_backref_value(char *p)
11167 for (;isDIGIT(*q); q++) {} /* calculate length of num */
11168 if (q - p == 0 || q - p > 9)
11175 - regatom - the lowest level
11177 Try to identify anything special at the start of the pattern. If there
11178 is, then handle it as required. This may involve generating a single regop,
11179 such as for an assertion; or it may involve recursing, such as to
11180 handle a () structure.
11182 If the string doesn't start with something special then we gobble up
11183 as much literal text as we can.
11185 Once we have been able to handle whatever type of thing started the
11186 sequence, we return.
11188 Note: we have to be careful with escapes, as they can be both literal
11189 and special, and in the case of \10 and friends, context determines which.
11191 A summary of the code structure is:
11193 switch (first_byte) {
11194 cases for each special:
11195 handle this special;
11198 switch (2nd byte) {
11199 cases for each unambiguous special:
11200 handle this special;
11202 cases for each ambigous special/literal:
11204 if (special) handle here
11206 default: // unambiguously literal:
11209 default: // is a literal char
11212 create EXACTish node for literal;
11213 while (more input and node isn't full) {
11214 switch (input_byte) {
11215 cases for each special;
11216 make sure parse pointer is set so that the next call to
11217 regatom will see this special first
11218 goto loopdone; // EXACTish node terminated by prev. char
11220 append char to EXACTISH node;
11222 get next input byte;
11226 return the generated node;
11228 Specifically there are two separate switches for handling
11229 escape sequences, with the one for handling literal escapes requiring
11230 a dummy entry for all of the special escapes that are actually handled
11233 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11235 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11237 Otherwise does not return NULL.
11241 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11244 regnode *ret = NULL;
11246 char *parse_start = RExC_parse;
11251 GET_RE_DEBUG_FLAGS_DECL;
11253 *flagp = WORST; /* Tentatively. */
11255 DEBUG_PARSE("atom");
11257 PERL_ARGS_ASSERT_REGATOM;
11260 switch ((U8)*RExC_parse) {
11262 RExC_seen_zerolen++;
11263 nextchar(pRExC_state);
11264 if (RExC_flags & RXf_PMf_MULTILINE)
11265 ret = reg_node(pRExC_state, MBOL);
11266 else if (RExC_flags & RXf_PMf_SINGLELINE)
11267 ret = reg_node(pRExC_state, SBOL);
11269 ret = reg_node(pRExC_state, BOL);
11270 Set_Node_Length(ret, 1); /* MJD */
11273 nextchar(pRExC_state);
11275 RExC_seen_zerolen++;
11276 if (RExC_flags & RXf_PMf_MULTILINE)
11277 ret = reg_node(pRExC_state, MEOL);
11278 else if (RExC_flags & RXf_PMf_SINGLELINE)
11279 ret = reg_node(pRExC_state, SEOL);
11281 ret = reg_node(pRExC_state, EOL);
11282 Set_Node_Length(ret, 1); /* MJD */
11285 nextchar(pRExC_state);
11286 if (RExC_flags & RXf_PMf_SINGLELINE)
11287 ret = reg_node(pRExC_state, SANY);
11289 ret = reg_node(pRExC_state, REG_ANY);
11290 *flagp |= HASWIDTH|SIMPLE;
11292 Set_Node_Length(ret, 1); /* MJD */
11296 char * const oregcomp_parse = ++RExC_parse;
11297 ret = regclass(pRExC_state, flagp,depth+1,
11298 FALSE, /* means parse the whole char class */
11299 TRUE, /* allow multi-char folds */
11300 FALSE, /* don't silence non-portable warnings. */
11302 if (*RExC_parse != ']') {
11303 RExC_parse = oregcomp_parse;
11304 vFAIL("Unmatched [");
11307 if (*flagp & RESTART_UTF8)
11309 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11312 nextchar(pRExC_state);
11313 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11317 nextchar(pRExC_state);
11318 ret = reg(pRExC_state, 2, &flags,depth+1);
11320 if (flags & TRYAGAIN) {
11321 if (RExC_parse == RExC_end) {
11322 /* Make parent create an empty node if needed. */
11323 *flagp |= TRYAGAIN;
11328 if (flags & RESTART_UTF8) {
11329 *flagp = RESTART_UTF8;
11332 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11335 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11339 if (flags & TRYAGAIN) {
11340 *flagp |= TRYAGAIN;
11343 vFAIL("Internal urp");
11344 /* Supposed to be caught earlier. */
11350 vFAIL("Quantifier follows nothing");
11355 This switch handles escape sequences that resolve to some kind
11356 of special regop and not to literal text. Escape sequnces that
11357 resolve to literal text are handled below in the switch marked
11360 Every entry in this switch *must* have a corresponding entry
11361 in the literal escape switch. However, the opposite is not
11362 required, as the default for this switch is to jump to the
11363 literal text handling code.
11365 switch ((U8)*++RExC_parse) {
11366 /* Special Escapes */
11368 RExC_seen_zerolen++;
11369 ret = reg_node(pRExC_state, SBOL);
11371 goto finish_meta_pat;
11373 ret = reg_node(pRExC_state, GPOS);
11374 RExC_seen |= REG_GPOS_SEEN;
11376 goto finish_meta_pat;
11378 RExC_seen_zerolen++;
11379 ret = reg_node(pRExC_state, KEEPS);
11381 /* XXX:dmq : disabling in-place substitution seems to
11382 * be necessary here to avoid cases of memory corruption, as
11383 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11385 RExC_seen |= REG_LOOKBEHIND_SEEN;
11386 goto finish_meta_pat;
11388 ret = reg_node(pRExC_state, SEOL);
11390 RExC_seen_zerolen++; /* Do not optimize RE away */
11391 goto finish_meta_pat;
11393 ret = reg_node(pRExC_state, EOS);
11395 RExC_seen_zerolen++; /* Do not optimize RE away */
11396 goto finish_meta_pat;
11398 ret = reg_node(pRExC_state, CANY);
11399 RExC_seen |= REG_CANY_SEEN;
11400 *flagp |= HASWIDTH|SIMPLE;
11401 goto finish_meta_pat;
11403 ret = reg_node(pRExC_state, CLUMP);
11404 *flagp |= HASWIDTH;
11405 goto finish_meta_pat;
11411 arg = ANYOF_WORDCHAR;
11415 RExC_seen_zerolen++;
11416 RExC_seen |= REG_LOOKBEHIND_SEEN;
11417 op = BOUND + get_regex_charset(RExC_flags);
11418 if (op > BOUNDA) { /* /aa is same as /a */
11421 else if (op == BOUNDL) {
11422 RExC_contains_locale = 1;
11424 ret = reg_node(pRExC_state, op);
11425 FLAGS(ret) = get_regex_charset(RExC_flags);
11427 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11428 /* diag_listed_as: Use "%s" instead of "%s" */
11429 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11431 goto finish_meta_pat;
11433 RExC_seen_zerolen++;
11434 RExC_seen |= REG_LOOKBEHIND_SEEN;
11435 op = NBOUND + get_regex_charset(RExC_flags);
11436 if (op > NBOUNDA) { /* /aa is same as /a */
11439 else if (op == NBOUNDL) {
11440 RExC_contains_locale = 1;
11442 ret = reg_node(pRExC_state, op);
11443 FLAGS(ret) = get_regex_charset(RExC_flags);
11445 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11446 /* diag_listed_as: Use "%s" instead of "%s" */
11447 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11449 goto finish_meta_pat;
11459 ret = reg_node(pRExC_state, LNBREAK);
11460 *flagp |= HASWIDTH|SIMPLE;
11461 goto finish_meta_pat;
11469 goto join_posix_op_known;
11475 arg = ANYOF_VERTWS;
11477 goto join_posix_op_known;
11487 op = POSIXD + get_regex_charset(RExC_flags);
11488 if (op > POSIXA) { /* /aa is same as /a */
11491 else if (op == POSIXL) {
11492 RExC_contains_locale = 1;
11495 join_posix_op_known:
11498 op += NPOSIXD - POSIXD;
11501 ret = reg_node(pRExC_state, op);
11503 FLAGS(ret) = namedclass_to_classnum(arg);
11506 *flagp |= HASWIDTH|SIMPLE;
11510 nextchar(pRExC_state);
11511 Set_Node_Length(ret, 2); /* MJD */
11517 char* parse_start = RExC_parse - 2;
11522 ret = regclass(pRExC_state, flagp,depth+1,
11523 TRUE, /* means just parse this element */
11524 FALSE, /* don't allow multi-char folds */
11525 FALSE, /* don't silence non-portable warnings.
11526 It would be a bug if these returned
11529 /* regclass() can only return RESTART_UTF8 if multi-char folds
11532 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11537 Set_Node_Offset(ret, parse_start + 2);
11538 Set_Node_Cur_Length(ret, parse_start);
11539 nextchar(pRExC_state);
11543 /* Handle \N and \N{NAME} with multiple code points here and not
11544 * below because it can be multicharacter. join_exact() will join
11545 * them up later on. Also this makes sure that things like
11546 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11547 * The options to the grok function call causes it to fail if the
11548 * sequence is just a single code point. We then go treat it as
11549 * just another character in the current EXACT node, and hence it
11550 * gets uniform treatment with all the other characters. The
11551 * special treatment for quantifiers is not needed for such single
11552 * character sequences */
11554 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11555 FALSE /* not strict */ )) {
11556 if (*flagp & RESTART_UTF8)
11562 case 'k': /* Handle \k<NAME> and \k'NAME' */
11565 char ch= RExC_parse[1];
11566 if (ch != '<' && ch != '\'' && ch != '{') {
11568 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11569 vFAIL2("Sequence %.2s... not terminated",parse_start);
11571 /* this pretty much dupes the code for (?P=...) in reg(), if
11572 you change this make sure you change that */
11573 char* name_start = (RExC_parse += 2);
11575 SV *sv_dat = reg_scan_name(pRExC_state,
11576 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11577 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11578 if (RExC_parse == name_start || *RExC_parse != ch)
11579 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11580 vFAIL2("Sequence %.3s... not terminated",parse_start);
11583 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11584 RExC_rxi->data->data[num]=(void*)sv_dat;
11585 SvREFCNT_inc_simple_void(sv_dat);
11589 ret = reganode(pRExC_state,
11592 : (ASCII_FOLD_RESTRICTED)
11594 : (AT_LEAST_UNI_SEMANTICS)
11600 *flagp |= HASWIDTH;
11602 /* override incorrect value set in reganode MJD */
11603 Set_Node_Offset(ret, parse_start+1);
11604 Set_Node_Cur_Length(ret, parse_start);
11605 nextchar(pRExC_state);
11611 case '1': case '2': case '3': case '4':
11612 case '5': case '6': case '7': case '8': case '9':
11617 if (*RExC_parse == 'g') {
11621 if (*RExC_parse == '{') {
11625 if (*RExC_parse == '-') {
11629 if (hasbrace && !isDIGIT(*RExC_parse)) {
11630 if (isrel) RExC_parse--;
11632 goto parse_named_seq;
11635 num = S_backref_value(RExC_parse);
11637 vFAIL("Reference to invalid group 0");
11638 else if (num == I32_MAX) {
11639 if (isDIGIT(*RExC_parse))
11640 vFAIL("Reference to nonexistent group");
11642 vFAIL("Unterminated \\g... pattern");
11646 num = RExC_npar - num;
11648 vFAIL("Reference to nonexistent or unclosed group");
11652 num = S_backref_value(RExC_parse);
11653 /* bare \NNN might be backref or octal - if it is larger than or equal
11654 * RExC_npar then it is assumed to be and octal escape.
11655 * Note RExC_npar is +1 from the actual number of parens*/
11656 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11657 && *RExC_parse != '8' && *RExC_parse != '9'))
11659 /* Probably a character specified in octal, e.g. \35 */
11664 /* at this point RExC_parse definitely points to a backref
11667 #ifdef RE_TRACK_PATTERN_OFFSETS
11668 char * const parse_start = RExC_parse - 1; /* MJD */
11670 while (isDIGIT(*RExC_parse))
11673 if (*RExC_parse != '}')
11674 vFAIL("Unterminated \\g{...} pattern");
11678 if (num > (I32)RExC_rx->nparens)
11679 vFAIL("Reference to nonexistent group");
11682 ret = reganode(pRExC_state,
11685 : (ASCII_FOLD_RESTRICTED)
11687 : (AT_LEAST_UNI_SEMANTICS)
11693 *flagp |= HASWIDTH;
11695 /* override incorrect value set in reganode MJD */
11696 Set_Node_Offset(ret, parse_start+1);
11697 Set_Node_Cur_Length(ret, parse_start);
11699 nextchar(pRExC_state);
11704 if (RExC_parse >= RExC_end)
11705 FAIL("Trailing \\");
11708 /* Do not generate "unrecognized" warnings here, we fall
11709 back into the quick-grab loop below */
11716 if (RExC_flags & RXf_PMf_EXTENDED) {
11717 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
11718 if (RExC_parse < RExC_end)
11725 parse_start = RExC_parse - 1;
11734 #define MAX_NODE_STRING_SIZE 127
11735 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11737 U8 upper_parse = MAX_NODE_STRING_SIZE;
11738 U8 node_type = compute_EXACTish(pRExC_state);
11739 bool next_is_quantifier;
11740 char * oldp = NULL;
11742 /* We can convert EXACTF nodes to EXACTFU if they contain only
11743 * characters that match identically regardless of the target
11744 * string's UTF8ness. The reason to do this is that EXACTF is not
11745 * trie-able, EXACTFU is.
11747 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11748 * contain only above-Latin1 characters (hence must be in UTF8),
11749 * which don't participate in folds with Latin1-range characters,
11750 * as the latter's folds aren't known until runtime. (We don't
11751 * need to figure this out until pass 2) */
11752 bool maybe_exactfu = PASS2
11753 && (node_type == EXACTF || node_type == EXACTFL);
11755 /* If a folding node contains only code points that don't
11756 * participate in folds, it can be changed into an EXACT node,
11757 * which allows the optimizer more things to look for */
11760 ret = reg_node(pRExC_state, node_type);
11762 /* In pass1, folded, we use a temporary buffer instead of the
11763 * actual node, as the node doesn't exist yet */
11764 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11770 /* We do the EXACTFish to EXACT node only if folding. (And we
11771 * don't need to figure this out until pass 2) */
11772 maybe_exact = FOLD && PASS2;
11774 /* XXX The node can hold up to 255 bytes, yet this only goes to
11775 * 127. I (khw) do not know why. Keeping it somewhat less than
11776 * 255 allows us to not have to worry about overflow due to
11777 * converting to utf8 and fold expansion, but that value is
11778 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
11779 * split up by this limit into a single one using the real max of
11780 * 255. Even at 127, this breaks under rare circumstances. If
11781 * folding, we do not want to split a node at a character that is a
11782 * non-final in a multi-char fold, as an input string could just
11783 * happen to want to match across the node boundary. The join
11784 * would solve that problem if the join actually happens. But a
11785 * series of more than two nodes in a row each of 127 would cause
11786 * the first join to succeed to get to 254, but then there wouldn't
11787 * be room for the next one, which could at be one of those split
11788 * multi-char folds. I don't know of any fool-proof solution. One
11789 * could back off to end with only a code point that isn't such a
11790 * non-final, but it is possible for there not to be any in the
11792 for (p = RExC_parse - 1;
11793 len < upper_parse && p < RExC_end;
11798 if (RExC_flags & RXf_PMf_EXTENDED)
11799 p = regpatws(pRExC_state, p,
11800 TRUE); /* means recognize comments */
11811 /* Literal Escapes Switch
11813 This switch is meant to handle escape sequences that
11814 resolve to a literal character.
11816 Every escape sequence that represents something
11817 else, like an assertion or a char class, is handled
11818 in the switch marked 'Special Escapes' above in this
11819 routine, but also has an entry here as anything that
11820 isn't explicitly mentioned here will be treated as
11821 an unescaped equivalent literal.
11824 switch ((U8)*++p) {
11825 /* These are all the special escapes. */
11826 case 'A': /* Start assertion */
11827 case 'b': case 'B': /* Word-boundary assertion*/
11828 case 'C': /* Single char !DANGEROUS! */
11829 case 'd': case 'D': /* digit class */
11830 case 'g': case 'G': /* generic-backref, pos assertion */
11831 case 'h': case 'H': /* HORIZWS */
11832 case 'k': case 'K': /* named backref, keep marker */
11833 case 'p': case 'P': /* Unicode property */
11834 case 'R': /* LNBREAK */
11835 case 's': case 'S': /* space class */
11836 case 'v': case 'V': /* VERTWS */
11837 case 'w': case 'W': /* word class */
11838 case 'X': /* eXtended Unicode "combining
11839 character sequence" */
11840 case 'z': case 'Z': /* End of line/string assertion */
11844 /* Anything after here is an escape that resolves to a
11845 literal. (Except digits, which may or may not)
11851 case 'N': /* Handle a single-code point named character. */
11852 /* The options cause it to fail if a multiple code
11853 * point sequence. Handle those in the switch() above
11855 RExC_parse = p + 1;
11856 if (! grok_bslash_N(pRExC_state, NULL, &ender,
11857 flagp, depth, FALSE,
11858 FALSE /* not strict */ ))
11860 if (*flagp & RESTART_UTF8)
11861 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11862 RExC_parse = p = oldp;
11866 if (ender > 0xff) {
11883 ender = ASCII_TO_NATIVE('\033');
11893 const char* error_msg;
11895 bool valid = grok_bslash_o(&p,
11898 TRUE, /* out warnings */
11899 FALSE, /* not strict */
11900 TRUE, /* Output warnings
11905 RExC_parse = p; /* going to die anyway; point
11906 to exact spot of failure */
11910 if (PL_encoding && ender < 0x100) {
11911 goto recode_encoding;
11913 if (ender > 0xff) {
11920 UV result = UV_MAX; /* initialize to erroneous
11922 const char* error_msg;
11924 bool valid = grok_bslash_x(&p,
11927 TRUE, /* out warnings */
11928 FALSE, /* not strict */
11929 TRUE, /* Output warnings
11934 RExC_parse = p; /* going to die anyway; point
11935 to exact spot of failure */
11940 if (PL_encoding && ender < 0x100) {
11941 goto recode_encoding;
11943 if (ender > 0xff) {
11950 ender = grok_bslash_c(*p++, SIZE_ONLY);
11952 case '8': case '9': /* must be a backreference */
11955 case '1': case '2': case '3':case '4':
11956 case '5': case '6': case '7':
11957 /* When we parse backslash escapes there is ambiguity
11958 * between backreferences and octal escapes. Any escape
11959 * from \1 - \9 is a backreference, any multi-digit
11960 * escape which does not start with 0 and which when
11961 * evaluated as decimal could refer to an already
11962 * parsed capture buffer is a backslash. Anything else
11965 * Note this implies that \118 could be interpreted as
11966 * 118 OR as "\11" . "8" depending on whether there
11967 * were 118 capture buffers defined already in the
11970 /* NOTE, RExC_npar is 1 more than the actual number of
11971 * parens we have seen so far, hence the < RExC_npar below. */
11973 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11974 { /* Not to be treated as an octal constant, go
11982 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11984 ender = grok_oct(p, &numlen, &flags, NULL);
11985 if (ender > 0xff) {
11989 if (SIZE_ONLY /* like \08, \178 */
11992 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11994 reg_warn_non_literal_string(
11996 form_short_octal_warning(p, numlen));
11999 if (PL_encoding && ender < 0x100)
12000 goto recode_encoding;
12003 if (! RExC_override_recoding) {
12004 SV* enc = PL_encoding;
12005 ender = reg_recode((const char)(U8)ender, &enc);
12006 if (!enc && SIZE_ONLY)
12007 ckWARNreg(p, "Invalid escape in the specified encoding");
12013 FAIL("Trailing \\");
12016 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12017 /* Include any { following the alpha to emphasize
12018 * that it could be part of an escape at some point
12020 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12021 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12023 goto normal_default;
12024 } /* End of switch on '\' */
12027 /* Currently we don't warn when the lbrace is at the start
12028 * of a construct. This catches it in the middle of a
12029 * literal string, or when its the first thing after
12030 * something like "\b" */
12032 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12034 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12037 default: /* A literal character */
12039 if (UTF8_IS_START(*p) && UTF) {
12041 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12042 &numlen, UTF8_ALLOW_DEFAULT);
12048 } /* End of switch on the literal */
12050 /* Here, have looked at the literal character and <ender>
12051 * contains its ordinal, <p> points to the character after it
12054 if ( RExC_flags & RXf_PMf_EXTENDED)
12055 p = regpatws(pRExC_state, p,
12056 TRUE); /* means recognize comments */
12058 /* If the next thing is a quantifier, it applies to this
12059 * character only, which means that this character has to be in
12060 * its own node and can't just be appended to the string in an
12061 * existing node, so if there are already other characters in
12062 * the node, close the node with just them, and set up to do
12063 * this character again next time through, when it will be the
12064 * only thing in its new node */
12065 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12071 if (! FOLD /* The simple case, just append the literal */
12072 || (LOC /* Also don't fold for tricky chars under /l */
12073 && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12076 const STRLEN unilen = reguni(pRExC_state, ender, s);
12082 /* The loop increments <len> each time, as all but this
12083 * path (and one other) through it add a single byte to
12084 * the EXACTish node. But this one has changed len to
12085 * be the correct final value, so subtract one to
12086 * cancel out the increment that follows */
12090 REGC((char)ender, s++);
12093 /* Can get here if folding only if is one of the /l
12094 * characters whose fold depends on the locale. The
12095 * occurrence of any of these indicate that we can't
12096 * simplify things */
12098 maybe_exact = FALSE;
12099 maybe_exactfu = FALSE;
12104 /* See comments for join_exact() as to why we fold this
12105 * non-UTF at compile time */
12106 || (node_type == EXACTFU
12107 && ender == LATIN_SMALL_LETTER_SHARP_S)))
12109 /* Here, are folding and are not UTF-8 encoded; therefore
12110 * the character must be in the range 0-255, and is not /l
12111 * (Not /l because we already handled these under /l in
12112 * is_PROBLEMATIC_LOCALE_FOLD_cp */
12113 if (IS_IN_SOME_FOLD_L1(ender)) {
12114 maybe_exact = FALSE;
12116 /* See if the character's fold differs between /d and
12117 * /u. This includes the multi-char fold SHARP S to
12120 && (PL_fold[ender] != PL_fold_latin1[ender]
12121 || ender == LATIN_SMALL_LETTER_SHARP_S
12123 && isARG2_lower_or_UPPER_ARG1('s', ender)
12124 && isARG2_lower_or_UPPER_ARG1('s',
12127 maybe_exactfu = FALSE;
12131 /* Even when folding, we store just the input character, as
12132 * we have an array that finds its fold quickly */
12133 *(s++) = (char) ender;
12135 else { /* FOLD and UTF */
12136 /* Unlike the non-fold case, we do actually have to
12137 * calculate the results here in pass 1. This is for two
12138 * reasons, the folded length may be longer than the
12139 * unfolded, and we have to calculate how many EXACTish
12140 * nodes it will take; and we may run out of room in a node
12141 * in the middle of a potential multi-char fold, and have
12142 * to back off accordingly. (Hence we can't use REGC for
12143 * the simple case just below.) */
12146 if (isASCII(ender)) {
12147 folded = toFOLD(ender);
12148 *(s)++ = (U8) folded;
12153 folded = _to_uni_fold_flags(
12157 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12158 ? FOLD_FLAGS_NOMIX_ASCII
12162 /* The loop increments <len> each time, as all but this
12163 * path (and one other) through it add a single byte to
12164 * the EXACTish node. But this one has changed len to
12165 * be the correct final value, so subtract one to
12166 * cancel out the increment that follows */
12167 len += foldlen - 1;
12169 /* If this node only contains non-folding code points so
12170 * far, see if this new one is also non-folding */
12172 if (folded != ender) {
12173 maybe_exact = FALSE;
12176 /* Here the fold is the original; we have to check
12177 * further to see if anything folds to it */
12178 if (_invlist_contains_cp(PL_utf8_foldable,
12181 maybe_exact = FALSE;
12188 if (next_is_quantifier) {
12190 /* Here, the next input is a quantifier, and to get here,
12191 * the current character is the only one in the node.
12192 * Also, here <len> doesn't include the final byte for this
12198 } /* End of loop through literal characters */
12200 /* Here we have either exhausted the input or ran out of room in
12201 * the node. (If we encountered a character that can't be in the
12202 * node, transfer is made directly to <loopdone>, and so we
12203 * wouldn't have fallen off the end of the loop.) In the latter
12204 * case, we artificially have to split the node into two, because
12205 * we just don't have enough space to hold everything. This
12206 * creates a problem if the final character participates in a
12207 * multi-character fold in the non-final position, as a match that
12208 * should have occurred won't, due to the way nodes are matched,
12209 * and our artificial boundary. So back off until we find a non-
12210 * problematic character -- one that isn't at the beginning or
12211 * middle of such a fold. (Either it doesn't participate in any
12212 * folds, or appears only in the final position of all the folds it
12213 * does participate in.) A better solution with far fewer false
12214 * positives, and that would fill the nodes more completely, would
12215 * be to actually have available all the multi-character folds to
12216 * test against, and to back-off only far enough to be sure that
12217 * this node isn't ending with a partial one. <upper_parse> is set
12218 * further below (if we need to reparse the node) to include just
12219 * up through that final non-problematic character that this code
12220 * identifies, so when it is set to less than the full node, we can
12221 * skip the rest of this */
12222 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12224 const STRLEN full_len = len;
12226 assert(len >= MAX_NODE_STRING_SIZE);
12228 /* Here, <s> points to the final byte of the final character.
12229 * Look backwards through the string until find a non-
12230 * problematic character */
12234 /* This has no multi-char folds to non-UTF characters */
12235 if (ASCII_FOLD_RESTRICTED) {
12239 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12243 if (! PL_NonL1NonFinalFold) {
12244 PL_NonL1NonFinalFold = _new_invlist_C_array(
12245 NonL1_Perl_Non_Final_Folds_invlist);
12248 /* Point to the first byte of the final character */
12249 s = (char *) utf8_hop((U8 *) s, -1);
12251 while (s >= s0) { /* Search backwards until find
12252 non-problematic char */
12253 if (UTF8_IS_INVARIANT(*s)) {
12255 /* There are no ascii characters that participate
12256 * in multi-char folds under /aa. In EBCDIC, the
12257 * non-ascii invariants are all control characters,
12258 * so don't ever participate in any folds. */
12259 if (ASCII_FOLD_RESTRICTED
12260 || ! IS_NON_FINAL_FOLD(*s))
12265 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12266 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12272 else if (! _invlist_contains_cp(
12273 PL_NonL1NonFinalFold,
12274 valid_utf8_to_uvchr((U8 *) s, NULL)))
12279 /* Here, the current character is problematic in that
12280 * it does occur in the non-final position of some
12281 * fold, so try the character before it, but have to
12282 * special case the very first byte in the string, so
12283 * we don't read outside the string */
12284 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12285 } /* End of loop backwards through the string */
12287 /* If there were only problematic characters in the string,
12288 * <s> will point to before s0, in which case the length
12289 * should be 0, otherwise include the length of the
12290 * non-problematic character just found */
12291 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12294 /* Here, have found the final character, if any, that is
12295 * non-problematic as far as ending the node without splitting
12296 * it across a potential multi-char fold. <len> contains the
12297 * number of bytes in the node up-to and including that
12298 * character, or is 0 if there is no such character, meaning
12299 * the whole node contains only problematic characters. In
12300 * this case, give up and just take the node as-is. We can't
12305 /* If the node ends in an 's' we make sure it stays EXACTF,
12306 * as if it turns into an EXACTFU, it could later get
12307 * joined with another 's' that would then wrongly match
12309 if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12311 maybe_exactfu = FALSE;
12315 /* Here, the node does contain some characters that aren't
12316 * problematic. If one such is the final character in the
12317 * node, we are done */
12318 if (len == full_len) {
12321 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12323 /* If the final character is problematic, but the
12324 * penultimate is not, back-off that last character to
12325 * later start a new node with it */
12330 /* Here, the final non-problematic character is earlier
12331 * in the input than the penultimate character. What we do
12332 * is reparse from the beginning, going up only as far as
12333 * this final ok one, thus guaranteeing that the node ends
12334 * in an acceptable character. The reason we reparse is
12335 * that we know how far in the character is, but we don't
12336 * know how to correlate its position with the input parse.
12337 * An alternate implementation would be to build that
12338 * correlation as we go along during the original parse,
12339 * but that would entail extra work for every node, whereas
12340 * this code gets executed only when the string is too
12341 * large for the node, and the final two characters are
12342 * problematic, an infrequent occurrence. Yet another
12343 * possible strategy would be to save the tail of the
12344 * string, and the next time regatom is called, initialize
12345 * with that. The problem with this is that unless you
12346 * back off one more character, you won't be guaranteed
12347 * regatom will get called again, unless regbranch,
12348 * regpiece ... are also changed. If you do back off that
12349 * extra character, so that there is input guaranteed to
12350 * force calling regatom, you can't handle the case where
12351 * just the first character in the node is acceptable. I
12352 * (khw) decided to try this method which doesn't have that
12353 * pitfall; if performance issues are found, we can do a
12354 * combination of the current approach plus that one */
12360 } /* End of verifying node ends with an appropriate char */
12362 loopdone: /* Jumped to when encounters something that shouldn't be in
12365 /* I (khw) don't know if you can get here with zero length, but the
12366 * old code handled this situation by creating a zero-length EXACT
12367 * node. Might as well be NOTHING instead */
12373 /* If 'maybe_exact' is still set here, means there are no
12374 * code points in the node that participate in folds;
12375 * similarly for 'maybe_exactfu' and code points that match
12376 * differently depending on UTF8ness of the target string
12377 * (for /u), or depending on locale for /l */
12381 else if (maybe_exactfu) {
12385 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12386 FALSE /* Don't look to see if could
12387 be turned into an EXACT
12388 node, as we have already
12393 RExC_parse = p - 1;
12394 Set_Node_Cur_Length(ret, parse_start);
12395 nextchar(pRExC_state);
12397 /* len is STRLEN which is unsigned, need to copy to signed */
12400 vFAIL("Internal disaster");
12403 } /* End of label 'defchar:' */
12405 } /* End of giant switch on input character */
12411 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12413 /* Returns the next non-pattern-white space, non-comment character (the
12414 * latter only if 'recognize_comment is true) in the string p, which is
12415 * ended by RExC_end. See also reg_skipcomment */
12416 const char *e = RExC_end;
12418 PERL_ARGS_ASSERT_REGPATWS;
12422 if ((len = is_PATWS_safe(p, e, UTF))) {
12425 else if (recognize_comment && *p == '#') {
12426 p = reg_skipcomment(pRExC_state, p);
12435 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12437 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12438 * sets up the bitmap and any flags, removing those code points from the
12439 * inversion list, setting it to NULL should it become completely empty */
12441 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12442 assert(PL_regkind[OP(node)] == ANYOF);
12444 ANYOF_BITMAP_ZERO(node);
12445 if (*invlist_ptr) {
12447 /* This gets set if we actually need to modify things */
12448 bool change_invlist = FALSE;
12452 /* Start looking through *invlist_ptr */
12453 invlist_iterinit(*invlist_ptr);
12454 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12458 if (end == UV_MAX && start <= 256) {
12459 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12461 else if (end >= 256) {
12462 ANYOF_FLAGS(node) |= ANYOF_UTF8;
12465 /* Quit if are above what we should change */
12470 change_invlist = TRUE;
12472 /* Set all the bits in the range, up to the max that we are doing */
12473 high = (end < 255) ? end : 255;
12474 for (i = start; i <= (int) high; i++) {
12475 if (! ANYOF_BITMAP_TEST(node, i)) {
12476 ANYOF_BITMAP_SET(node, i);
12480 invlist_iterfinish(*invlist_ptr);
12482 /* Done with loop; remove any code points that are in the bitmap from
12483 * *invlist_ptr; similarly for code points above latin1 if we have a
12484 * flag to match all of them anyways */
12485 if (change_invlist) {
12486 _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12488 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12489 _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12492 /* If have completely emptied it, remove it completely */
12493 if (_invlist_len(*invlist_ptr) == 0) {
12494 SvREFCNT_dec_NN(*invlist_ptr);
12495 *invlist_ptr = NULL;
12500 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12501 Character classes ([:foo:]) can also be negated ([:^foo:]).
12502 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12503 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12504 but trigger failures because they are currently unimplemented. */
12506 #define POSIXCC_DONE(c) ((c) == ':')
12507 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12508 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12510 PERL_STATIC_INLINE I32
12511 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12514 I32 namedclass = OOB_NAMEDCLASS;
12516 PERL_ARGS_ASSERT_REGPPOSIXCC;
12518 if (value == '[' && RExC_parse + 1 < RExC_end &&
12519 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12520 POSIXCC(UCHARAT(RExC_parse)))
12522 const char c = UCHARAT(RExC_parse);
12523 char* const s = RExC_parse++;
12525 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12527 if (RExC_parse == RExC_end) {
12530 /* Try to give a better location for the error (than the end of
12531 * the string) by looking for the matching ']' */
12533 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12536 vFAIL2("Unmatched '%c' in POSIX class", c);
12538 /* Grandfather lone [:, [=, [. */
12542 const char* const t = RExC_parse++; /* skip over the c */
12545 if (UCHARAT(RExC_parse) == ']') {
12546 const char *posixcc = s + 1;
12547 RExC_parse++; /* skip over the ending ] */
12550 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12551 const I32 skip = t - posixcc;
12553 /* Initially switch on the length of the name. */
12556 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12557 this is the Perl \w
12559 namedclass = ANYOF_WORDCHAR;
12562 /* Names all of length 5. */
12563 /* alnum alpha ascii blank cntrl digit graph lower
12564 print punct space upper */
12565 /* Offset 4 gives the best switch position. */
12566 switch (posixcc[4]) {
12568 if (memEQ(posixcc, "alph", 4)) /* alpha */
12569 namedclass = ANYOF_ALPHA;
12572 if (memEQ(posixcc, "spac", 4)) /* space */
12573 namedclass = ANYOF_PSXSPC;
12576 if (memEQ(posixcc, "grap", 4)) /* graph */
12577 namedclass = ANYOF_GRAPH;
12580 if (memEQ(posixcc, "asci", 4)) /* ascii */
12581 namedclass = ANYOF_ASCII;
12584 if (memEQ(posixcc, "blan", 4)) /* blank */
12585 namedclass = ANYOF_BLANK;
12588 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12589 namedclass = ANYOF_CNTRL;
12592 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12593 namedclass = ANYOF_ALPHANUMERIC;
12596 if (memEQ(posixcc, "lowe", 4)) /* lower */
12597 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12598 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12599 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12602 if (memEQ(posixcc, "digi", 4)) /* digit */
12603 namedclass = ANYOF_DIGIT;
12604 else if (memEQ(posixcc, "prin", 4)) /* print */
12605 namedclass = ANYOF_PRINT;
12606 else if (memEQ(posixcc, "punc", 4)) /* punct */
12607 namedclass = ANYOF_PUNCT;
12612 if (memEQ(posixcc, "xdigit", 6))
12613 namedclass = ANYOF_XDIGIT;
12617 if (namedclass == OOB_NAMEDCLASS)
12619 "POSIX class [:%"UTF8f":] unknown",
12620 UTF8fARG(UTF, t - s - 1, s + 1));
12622 /* The #defines are structured so each complement is +1 to
12623 * the normal one */
12627 assert (posixcc[skip] == ':');
12628 assert (posixcc[skip+1] == ']');
12629 } else if (!SIZE_ONLY) {
12630 /* [[=foo=]] and [[.foo.]] are still future. */
12632 /* adjust RExC_parse so the warning shows after
12633 the class closes */
12634 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12636 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12639 /* Maternal grandfather:
12640 * "[:" ending in ":" but not in ":]" */
12642 vFAIL("Unmatched '[' in POSIX class");
12645 /* Grandfather lone [:, [=, [. */
12655 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12657 /* This applies some heuristics at the current parse position (which should
12658 * be at a '[') to see if what follows might be intended to be a [:posix:]
12659 * class. It returns true if it really is a posix class, of course, but it
12660 * also can return true if it thinks that what was intended was a posix
12661 * class that didn't quite make it.
12663 * It will return true for
12665 * [:alphanumerics] (as long as the ] isn't followed immediately by a
12666 * ')' indicating the end of the (?[
12667 * [:any garbage including %^&$ punctuation:]
12669 * This is designed to be called only from S_handle_regex_sets; it could be
12670 * easily adapted to be called from the spot at the beginning of regclass()
12671 * that checks to see in a normal bracketed class if the surrounding []
12672 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
12673 * change long-standing behavior, so I (khw) didn't do that */
12674 char* p = RExC_parse + 1;
12675 char first_char = *p;
12677 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12679 assert(*(p - 1) == '[');
12681 if (! POSIXCC(first_char)) {
12686 while (p < RExC_end && isWORDCHAR(*p)) p++;
12688 if (p >= RExC_end) {
12692 if (p - RExC_parse > 2 /* Got at least 1 word character */
12693 && (*p == first_char
12694 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12699 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12702 && p - RExC_parse > 2 /* [:] evaluates to colon;
12703 [::] is a bad posix class. */
12704 && first_char == *(p - 1));
12708 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12709 I32 *flagp, U32 depth,
12710 char * const oregcomp_parse)
12712 /* Handle the (?[...]) construct to do set operations */
12715 UV start, end; /* End points of code point ranges */
12717 char *save_end, *save_parse;
12722 const bool save_fold = FOLD;
12724 GET_RE_DEBUG_FLAGS_DECL;
12726 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12729 vFAIL("(?[...]) not valid in locale");
12731 RExC_uni_semantics = 1;
12733 /* This will return only an ANYOF regnode, or (unlikely) something smaller
12734 * (such as EXACT). Thus we can skip most everything if just sizing. We
12735 * call regclass to handle '[]' so as to not have to reinvent its parsing
12736 * rules here (throwing away the size it computes each time). And, we exit
12737 * upon an unescaped ']' that isn't one ending a regclass. To do both
12738 * these things, we need to realize that something preceded by a backslash
12739 * is escaped, so we have to keep track of backslashes */
12741 UV depth = 0; /* how many nested (?[...]) constructs */
12743 Perl_ck_warner_d(aTHX_
12744 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12745 "The regex_sets feature is experimental" REPORT_LOCATION,
12746 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12748 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12749 RExC_precomp + (RExC_parse - RExC_precomp)));
12751 while (RExC_parse < RExC_end) {
12752 SV* current = NULL;
12753 RExC_parse = regpatws(pRExC_state, RExC_parse,
12754 TRUE); /* means recognize comments */
12755 switch (*RExC_parse) {
12757 if (RExC_parse[1] == '[') depth++, RExC_parse++;
12762 /* Skip the next byte (which could cause us to end up in
12763 * the middle of a UTF-8 character, but since none of those
12764 * are confusable with anything we currently handle in this
12765 * switch (invariants all), it's safe. We'll just hit the
12766 * default: case next time and keep on incrementing until
12767 * we find one of the invariants we do handle. */
12772 /* If this looks like it is a [:posix:] class, leave the
12773 * parse pointer at the '[' to fool regclass() into
12774 * thinking it is part of a '[[:posix:]]'. That function
12775 * will use strict checking to force a syntax error if it
12776 * doesn't work out to a legitimate class */
12777 bool is_posix_class
12778 = could_it_be_a_POSIX_class(pRExC_state);
12779 if (! is_posix_class) {
12783 /* regclass() can only return RESTART_UTF8 if multi-char
12784 folds are allowed. */
12785 if (!regclass(pRExC_state, flagp,depth+1,
12786 is_posix_class, /* parse the whole char
12787 class only if not a
12789 FALSE, /* don't allow multi-char folds */
12790 TRUE, /* silence non-portable warnings. */
12792 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12795 /* function call leaves parse pointing to the ']', except
12796 * if we faked it */
12797 if (is_posix_class) {
12801 SvREFCNT_dec(current); /* In case it returned something */
12806 if (depth--) break;
12808 if (RExC_parse < RExC_end
12809 && *RExC_parse == ')')
12811 node = reganode(pRExC_state, ANYOF, 0);
12812 RExC_size += ANYOF_SKIP;
12813 nextchar(pRExC_state);
12814 Set_Node_Length(node,
12815 RExC_parse - oregcomp_parse + 1); /* MJD */
12824 FAIL("Syntax error in (?[...])");
12827 /* Pass 2 only after this. Everything in this construct is a
12828 * metacharacter. Operands begin with either a '\' (for an escape
12829 * sequence), or a '[' for a bracketed character class. Any other
12830 * character should be an operator, or parenthesis for grouping. Both
12831 * types of operands are handled by calling regclass() to parse them. It
12832 * is called with a parameter to indicate to return the computed inversion
12833 * list. The parsing here is implemented via a stack. Each entry on the
12834 * stack is a single character representing one of the operators, or the
12835 * '('; or else a pointer to an operand inversion list. */
12837 #define IS_OPERAND(a) (! SvIOK(a))
12839 /* The stack starts empty. It is a syntax error if the first thing parsed
12840 * is a binary operator; everything else is pushed on the stack. When an
12841 * operand is parsed, the top of the stack is examined. If it is a binary
12842 * operator, the item before it should be an operand, and both are replaced
12843 * by the result of doing that operation on the new operand and the one on
12844 * the stack. Thus a sequence of binary operands is reduced to a single
12845 * one before the next one is parsed.
12847 * A unary operator may immediately follow a binary in the input, for
12850 * When an operand is parsed and the top of the stack is a unary operator,
12851 * the operation is performed, and then the stack is rechecked to see if
12852 * this new operand is part of a binary operation; if so, it is handled as
12855 * A '(' is simply pushed on the stack; it is valid only if the stack is
12856 * empty, or the top element of the stack is an operator or another '('
12857 * (for which the parenthesized expression will become an operand). By the
12858 * time the corresponding ')' is parsed everything in between should have
12859 * been parsed and evaluated to a single operand (or else is a syntax
12860 * error), and is handled as a regular operand */
12862 sv_2mortal((SV *)(stack = newAV()));
12864 while (RExC_parse < RExC_end) {
12865 I32 top_index = av_tindex(stack);
12867 SV* current = NULL;
12869 /* Skip white space */
12870 RExC_parse = regpatws(pRExC_state, RExC_parse,
12871 TRUE /* means recognize comments */ );
12872 if (RExC_parse >= RExC_end) {
12873 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12875 if ((curchar = UCHARAT(RExC_parse)) == ']') {
12882 if (av_tindex(stack) >= 0 /* This makes sure that we can
12883 safely subtract 1 from
12884 RExC_parse in the next clause.
12885 If we have something on the
12886 stack, we have parsed something
12888 && UCHARAT(RExC_parse - 1) == '('
12889 && RExC_parse < RExC_end)
12891 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12892 * This happens when we have some thing like
12894 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12896 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
12898 * Here we would be handling the interpolated
12899 * '$thai_or_lao'. We handle this by a recursive call to
12900 * ourselves which returns the inversion list the
12901 * interpolated expression evaluates to. We use the flags
12902 * from the interpolated pattern. */
12903 U32 save_flags = RExC_flags;
12904 const char * const save_parse = ++RExC_parse;
12906 parse_lparen_question_flags(pRExC_state);
12908 if (RExC_parse == save_parse /* Makes sure there was at
12909 least one flag (or this
12910 embedding wasn't compiled)
12912 || RExC_parse >= RExC_end - 4
12913 || UCHARAT(RExC_parse) != ':'
12914 || UCHARAT(++RExC_parse) != '('
12915 || UCHARAT(++RExC_parse) != '?'
12916 || UCHARAT(++RExC_parse) != '[')
12919 /* In combination with the above, this moves the
12920 * pointer to the point just after the first erroneous
12921 * character (or if there are no flags, to where they
12922 * should have been) */
12923 if (RExC_parse >= RExC_end - 4) {
12924 RExC_parse = RExC_end;
12926 else if (RExC_parse != save_parse) {
12927 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12929 vFAIL("Expecting '(?flags:(?[...'");
12932 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
12933 depth+1, oregcomp_parse);
12935 /* Here, 'current' contains the embedded expression's
12936 * inversion list, and RExC_parse points to the trailing
12937 * ']'; the next character should be the ')' which will be
12938 * paired with the '(' that has been put on the stack, so
12939 * the whole embedded expression reduces to '(operand)' */
12942 RExC_flags = save_flags;
12943 goto handle_operand;
12948 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12949 vFAIL("Unexpected character");
12952 /* regclass() can only return RESTART_UTF8 if multi-char
12953 folds are allowed. */
12954 if (!regclass(pRExC_state, flagp,depth+1,
12955 TRUE, /* means parse just the next thing */
12956 FALSE, /* don't allow multi-char folds */
12957 FALSE, /* don't silence non-portable warnings. */
12959 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12961 /* regclass() will return with parsing just the \ sequence,
12962 * leaving the parse pointer at the next thing to parse */
12964 goto handle_operand;
12966 case '[': /* Is a bracketed character class */
12968 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12970 if (! is_posix_class) {
12974 /* regclass() can only return RESTART_UTF8 if multi-char
12975 folds are allowed. */
12976 if(!regclass(pRExC_state, flagp,depth+1,
12977 is_posix_class, /* parse the whole char class
12978 only if not a posix class */
12979 FALSE, /* don't allow multi-char folds */
12980 FALSE, /* don't silence non-portable warnings. */
12982 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12984 /* function call leaves parse pointing to the ']', except if we
12986 if (is_posix_class) {
12990 goto handle_operand;
12999 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13000 || ! IS_OPERAND(*top_ptr))
13003 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13005 av_push(stack, newSVuv(curchar));
13009 av_push(stack, newSVuv(curchar));
13013 if (top_index >= 0) {
13014 top_ptr = av_fetch(stack, top_index, FALSE);
13016 if (IS_OPERAND(*top_ptr)) {
13018 vFAIL("Unexpected '(' with no preceding operator");
13021 av_push(stack, newSVuv(curchar));
13028 || ! (current = av_pop(stack))
13029 || ! IS_OPERAND(current)
13030 || ! (lparen = av_pop(stack))
13031 || IS_OPERAND(lparen)
13032 || SvUV(lparen) != '(')
13034 SvREFCNT_dec(current);
13036 vFAIL("Unexpected ')'");
13039 SvREFCNT_dec_NN(lparen);
13046 /* Here, we have an operand to process, in 'current' */
13048 if (top_index < 0) { /* Just push if stack is empty */
13049 av_push(stack, current);
13052 SV* top = av_pop(stack);
13054 char current_operator;
13056 if (IS_OPERAND(top)) {
13057 SvREFCNT_dec_NN(top);
13058 SvREFCNT_dec_NN(current);
13059 vFAIL("Operand with no preceding operator");
13061 current_operator = (char) SvUV(top);
13062 switch (current_operator) {
13063 case '(': /* Push the '(' back on followed by the new
13065 av_push(stack, top);
13066 av_push(stack, current);
13067 SvREFCNT_inc(top); /* Counters the '_dec' done
13068 just after the 'break', so
13069 it doesn't get wrongly freed
13074 _invlist_invert(current);
13076 /* Unlike binary operators, the top of the stack,
13077 * now that this unary one has been popped off, may
13078 * legally be an operator, and we now have operand
13081 SvREFCNT_dec_NN(top);
13082 goto handle_operand;
13085 prev = av_pop(stack);
13086 _invlist_intersection(prev,
13089 av_push(stack, current);
13094 prev = av_pop(stack);
13095 _invlist_union(prev, current, ¤t);
13096 av_push(stack, current);
13100 prev = av_pop(stack);;
13101 _invlist_subtract(prev, current, ¤t);
13102 av_push(stack, current);
13105 case '^': /* The union minus the intersection */
13111 prev = av_pop(stack);
13112 _invlist_union(prev, current, &u);
13113 _invlist_intersection(prev, current, &i);
13114 /* _invlist_subtract will overwrite current
13115 without freeing what it already contains */
13117 _invlist_subtract(u, i, ¤t);
13118 av_push(stack, current);
13119 SvREFCNT_dec_NN(i);
13120 SvREFCNT_dec_NN(u);
13121 SvREFCNT_dec_NN(element);
13126 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13128 SvREFCNT_dec_NN(top);
13129 SvREFCNT_dec(prev);
13133 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13136 if (av_tindex(stack) < 0 /* Was empty */
13137 || ((final = av_pop(stack)) == NULL)
13138 || ! IS_OPERAND(final)
13139 || av_tindex(stack) >= 0) /* More left on stack */
13141 vFAIL("Incomplete expression within '(?[ ])'");
13144 /* Here, 'final' is the resultant inversion list from evaluating the
13145 * expression. Return it if so requested */
13146 if (return_invlist) {
13147 *return_invlist = final;
13151 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13152 * expecting a string of ranges and individual code points */
13153 invlist_iterinit(final);
13154 result_string = newSVpvs("");
13155 while (invlist_iternext(final, &start, &end)) {
13156 if (start == end) {
13157 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13160 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13165 save_parse = RExC_parse;
13166 RExC_parse = SvPV(result_string, len);
13167 save_end = RExC_end;
13168 RExC_end = RExC_parse + len;
13170 /* We turn off folding around the call, as the class we have constructed
13171 * already has all folding taken into consideration, and we don't want
13172 * regclass() to add to that */
13173 RExC_flags &= ~RXf_PMf_FOLD;
13174 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13176 node = regclass(pRExC_state, flagp,depth+1,
13177 FALSE, /* means parse the whole char class */
13178 FALSE, /* don't allow multi-char folds */
13179 TRUE, /* silence non-portable warnings. The above may very
13180 well have generated non-portable code points, but
13181 they're valid on this machine */
13184 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13187 RExC_flags |= RXf_PMf_FOLD;
13189 RExC_parse = save_parse + 1;
13190 RExC_end = save_end;
13191 SvREFCNT_dec_NN(final);
13192 SvREFCNT_dec_NN(result_string);
13194 nextchar(pRExC_state);
13195 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13201 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13203 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13204 * innocent-looking character class, like /[ks]/i won't have to go out to
13205 * disk to find the possible matches.
13207 * This should be called only for a Latin1-range code points, cp, which is
13208 * known to be involved in a fold with other code points above Latin1. It
13209 * would give false results if /aa has been specified. Multi-char folds
13210 * are outside the scope of this, and must be handled specially.
13212 * XXX It would be better to generate these via regen, in case a new
13213 * version of the Unicode standard adds new mappings, though that is not
13214 * really likely, and may be caught by the default: case of the switch
13217 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13223 add_cp_to_invlist(*invlist, KELVIN_SIGN);
13227 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13230 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13231 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13233 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13234 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13235 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13237 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13238 *invlist = add_cp_to_invlist(*invlist,
13239 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13241 case LATIN_SMALL_LETTER_SHARP_S:
13242 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13244 case 'F': case 'f':
13245 case 'I': case 'i':
13246 case 'L': case 'l':
13247 case 'T': case 't':
13248 case 'A': case 'a':
13249 case 'H': case 'h':
13250 case 'J': case 'j':
13251 case 'N': case 'n':
13252 case 'W': case 'w':
13253 case 'Y': case 'y':
13254 /* These all are targets of multi-character folds from code points
13255 * that require UTF8 to express, so they can't match unless the
13256 * target string is in UTF-8, so no action here is necessary, as
13257 * regexec.c properly handles the general case for UTF-8 matching
13258 * and multi-char folds */
13261 /* Use deprecated warning to increase the chances of this being
13263 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13268 /* The names of properties whose definitions are not known at compile time are
13269 * stored in this SV, after a constant heading. So if the length has been
13270 * changed since initialization, then there is a run-time definition. */
13271 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
13272 (SvCUR(listsv) != initial_listsv_len)
13275 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13276 const bool stop_at_1, /* Just parse the next thing, don't
13277 look for a full character class */
13278 bool allow_multi_folds,
13279 const bool silence_non_portable, /* Don't output warnings
13282 SV** ret_invlist) /* Return an inversion list, not a node */
13284 /* parse a bracketed class specification. Most of these will produce an
13285 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13286 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
13287 * under /i with multi-character folds: it will be rewritten following the
13288 * paradigm of this example, where the <multi-fold>s are characters which
13289 * fold to multiple character sequences:
13290 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13291 * gets effectively rewritten as:
13292 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13293 * reg() gets called (recursively) on the rewritten version, and this
13294 * function will return what it constructs. (Actually the <multi-fold>s
13295 * aren't physically removed from the [abcdefghi], it's just that they are
13296 * ignored in the recursion by means of a flag:
13297 * <RExC_in_multi_char_class>.)
13299 * ANYOF nodes contain a bit map for the first 256 characters, with the
13300 * corresponding bit set if that character is in the list. For characters
13301 * above 255, a range list or swash is used. There are extra bits for \w,
13302 * etc. in locale ANYOFs, as what these match is not determinable at
13305 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13306 * to be restarted. This can only happen if ret_invlist is non-NULL.
13310 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13312 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13315 IV namedclass = OOB_NAMEDCLASS;
13316 char *rangebegin = NULL;
13317 bool need_class = 0;
13319 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13320 than just initialized. */
13321 SV* properties = NULL; /* Code points that match \p{} \P{} */
13322 SV* posixes = NULL; /* Code points that match classes like [:word:],
13323 extended beyond the Latin1 range. These have to
13324 be kept separate from other code points for much
13325 of this function because their handling is
13326 different under /i, and for most classes under
13328 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
13329 separate for a while from the non-complemented
13330 versions because of complications with /d
13332 UV element_count = 0; /* Number of distinct elements in the class.
13333 Optimizations may be possible if this is tiny */
13334 AV * multi_char_matches = NULL; /* Code points that fold to more than one
13335 character; used under /i */
13337 char * stop_ptr = RExC_end; /* where to stop parsing */
13338 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13340 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13342 /* Unicode properties are stored in a swash; this holds the current one
13343 * being parsed. If this swash is the only above-latin1 component of the
13344 * character class, an optimization is to pass it directly on to the
13345 * execution engine. Otherwise, it is set to NULL to indicate that there
13346 * are other things in the class that have to be dealt with at execution
13348 SV* swash = NULL; /* Code points that match \p{} \P{} */
13350 /* Set if a component of this character class is user-defined; just passed
13351 * on to the engine */
13352 bool has_user_defined_property = FALSE;
13354 /* inversion list of code points this node matches only when the target
13355 * string is in UTF-8. (Because is under /d) */
13356 SV* depends_list = NULL;
13358 /* Inversion list of code points this node matches regardless of things
13359 * like locale, folding, utf8ness of the target string */
13360 SV* cp_list = NULL;
13362 /* Like cp_list, but code points on this list need to be checked for things
13363 * that fold to/from them under /i */
13364 SV* cp_foldable_list = NULL;
13366 /* Like cp_list, but code points on this list are valid only when the
13367 * runtime locale is UTF-8 */
13368 SV* only_utf8_locale_list = NULL;
13371 /* In a range, counts how many 0-2 of the ends of it came from literals,
13372 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
13373 UV literal_endpoint = 0;
13375 bool invert = FALSE; /* Is this class to be complemented */
13377 bool warn_super = ALWAYS_WARN_SUPER;
13379 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13380 case we need to change the emitted regop to an EXACT. */
13381 const char * orig_parse = RExC_parse;
13382 const SSize_t orig_size = RExC_size;
13383 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13384 GET_RE_DEBUG_FLAGS_DECL;
13386 PERL_ARGS_ASSERT_REGCLASS;
13388 PERL_UNUSED_ARG(depth);
13391 DEBUG_PARSE("clas");
13393 /* Assume we are going to generate an ANYOF node. */
13394 ret = reganode(pRExC_state, ANYOF, 0);
13397 RExC_size += ANYOF_SKIP;
13398 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13401 ANYOF_FLAGS(ret) = 0;
13403 RExC_emit += ANYOF_SKIP;
13404 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13405 initial_listsv_len = SvCUR(listsv);
13406 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
13410 RExC_parse = regpatws(pRExC_state, RExC_parse,
13411 FALSE /* means don't recognize comments */ );
13414 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13417 allow_multi_folds = FALSE;
13420 RExC_parse = regpatws(pRExC_state, RExC_parse,
13421 FALSE /* means don't recognize comments */ );
13425 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13426 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13427 const char *s = RExC_parse;
13428 const char c = *s++;
13430 while (isWORDCHAR(*s))
13432 if (*s && c == *s && s[1] == ']') {
13433 SAVEFREESV(RExC_rx_sv);
13435 "POSIX syntax [%c %c] belongs inside character classes",
13437 (void)ReREFCNT_inc(RExC_rx_sv);
13441 /* If the caller wants us to just parse a single element, accomplish this
13442 * by faking the loop ending condition */
13443 if (stop_at_1 && RExC_end > RExC_parse) {
13444 stop_ptr = RExC_parse + 1;
13447 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13448 if (UCHARAT(RExC_parse) == ']')
13449 goto charclassloop;
13453 if (RExC_parse >= stop_ptr) {
13458 RExC_parse = regpatws(pRExC_state, RExC_parse,
13459 FALSE /* means don't recognize comments */ );
13462 if (UCHARAT(RExC_parse) == ']') {
13468 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13469 save_value = value;
13470 save_prevvalue = prevvalue;
13473 rangebegin = RExC_parse;
13477 value = utf8n_to_uvchr((U8*)RExC_parse,
13478 RExC_end - RExC_parse,
13479 &numlen, UTF8_ALLOW_DEFAULT);
13480 RExC_parse += numlen;
13483 value = UCHARAT(RExC_parse++);
13486 && RExC_parse < RExC_end
13487 && POSIXCC(UCHARAT(RExC_parse)))
13489 namedclass = regpposixcc(pRExC_state, value, strict);
13491 else if (value == '\\') {
13493 value = utf8n_to_uvchr((U8*)RExC_parse,
13494 RExC_end - RExC_parse,
13495 &numlen, UTF8_ALLOW_DEFAULT);
13496 RExC_parse += numlen;
13499 value = UCHARAT(RExC_parse++);
13501 /* Some compilers cannot handle switching on 64-bit integer
13502 * values, therefore value cannot be an UV. Yes, this will
13503 * be a problem later if we want switch on Unicode.
13504 * A similar issue a little bit later when switching on
13505 * namedclass. --jhi */
13507 /* If the \ is escaping white space when white space is being
13508 * skipped, it means that that white space is wanted literally, and
13509 * is already in 'value'. Otherwise, need to translate the escape
13510 * into what it signifies. */
13511 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13513 case 'w': namedclass = ANYOF_WORDCHAR; break;
13514 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13515 case 's': namedclass = ANYOF_SPACE; break;
13516 case 'S': namedclass = ANYOF_NSPACE; break;
13517 case 'd': namedclass = ANYOF_DIGIT; break;
13518 case 'D': namedclass = ANYOF_NDIGIT; break;
13519 case 'v': namedclass = ANYOF_VERTWS; break;
13520 case 'V': namedclass = ANYOF_NVERTWS; break;
13521 case 'h': namedclass = ANYOF_HORIZWS; break;
13522 case 'H': namedclass = ANYOF_NHORIZWS; break;
13523 case 'N': /* Handle \N{NAME} in class */
13525 /* We only pay attention to the first char of
13526 multichar strings being returned. I kinda wonder
13527 if this makes sense as it does change the behaviour
13528 from earlier versions, OTOH that behaviour was broken
13530 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13531 TRUE, /* => charclass */
13534 if (*flagp & RESTART_UTF8)
13535 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13545 /* We will handle any undefined properties ourselves */
13546 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13547 /* And we actually would prefer to get
13548 * the straight inversion list of the
13549 * swash, since we will be accessing it
13550 * anyway, to save a little time */
13551 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13553 if (RExC_parse >= RExC_end)
13554 vFAIL2("Empty \\%c{}", (U8)value);
13555 if (*RExC_parse == '{') {
13556 const U8 c = (U8)value;
13557 e = strchr(RExC_parse++, '}');
13559 vFAIL2("Missing right brace on \\%c{}", c);
13560 while (isSPACE(*RExC_parse))
13562 if (e == RExC_parse)
13563 vFAIL2("Empty \\%c{}", c);
13564 n = e - RExC_parse;
13565 while (isSPACE(*(RExC_parse + n - 1)))
13576 if (UCHARAT(RExC_parse) == '^') {
13579 /* toggle. (The rhs xor gets the single bit that
13580 * differs between P and p; the other xor inverts just
13582 value ^= 'P' ^ 'p';
13584 while (isSPACE(*RExC_parse)) {
13589 /* Try to get the definition of the property into
13590 * <invlist>. If /i is in effect, the effective property
13591 * will have its name be <__NAME_i>. The design is
13592 * discussed in commit
13593 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13594 name = savepv(Perl_form(aTHX_
13596 (FOLD) ? "__" : "",
13602 /* Look up the property name, and get its swash and
13603 * inversion list, if the property is found */
13605 SvREFCNT_dec_NN(swash);
13607 swash = _core_swash_init("utf8", name, &PL_sv_undef,
13610 NULL, /* No inversion list */
13613 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13614 HV* curpkg = (IN_PERL_COMPILETIME)
13616 : CopSTASH(PL_curcop);
13618 SvREFCNT_dec_NN(swash);
13622 /* Here didn't find it. It could be a user-defined
13623 * property that will be available at run-time. If we
13624 * accept only compile-time properties, is an error;
13625 * otherwise add it to the list for run-time look up */
13627 RExC_parse = e + 1;
13629 "Property '%"UTF8f"' is unknown",
13630 UTF8fARG(UTF, n, name));
13633 /* If the property name doesn't already have a package
13634 * name, add the current one to it so that it can be
13635 * referred to outside it. [perl #121777] */
13636 if (curpkg && ! instr(name, "::")) {
13637 char* pkgname = HvNAME(curpkg);
13638 if (strNE(pkgname, "main")) {
13639 char* full_name = Perl_form(aTHX_
13643 n = strlen(full_name);
13645 name = savepvn(full_name, n);
13648 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13649 (value == 'p' ? '+' : '!'),
13650 UTF8fARG(UTF, n, name));
13651 has_user_defined_property = TRUE;
13653 /* We don't know yet, so have to assume that the
13654 * property could match something in the Latin1 range,
13655 * hence something that isn't utf8. Note that this
13656 * would cause things in <depends_list> to match
13657 * inappropriately, except that any \p{}, including
13658 * this one forces Unicode semantics, which means there
13659 * is no <depends_list> */
13660 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13664 /* Here, did get the swash and its inversion list. If
13665 * the swash is from a user-defined property, then this
13666 * whole character class should be regarded as such */
13667 if (swash_init_flags
13668 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13670 has_user_defined_property = TRUE;
13673 /* We warn on matching an above-Unicode code point
13674 * if the match would return true, except don't
13675 * warn for \p{All}, which has exactly one element
13677 (_invlist_contains_cp(invlist, 0x110000)
13678 && (! (_invlist_len(invlist) == 1
13679 && *invlist_array(invlist) == 0)))
13685 /* Invert if asking for the complement */
13686 if (value == 'P') {
13687 _invlist_union_complement_2nd(properties,
13691 /* The swash can't be used as-is, because we've
13692 * inverted things; delay removing it to here after
13693 * have copied its invlist above */
13694 SvREFCNT_dec_NN(swash);
13698 _invlist_union(properties, invlist, &properties);
13703 RExC_parse = e + 1;
13704 namedclass = ANYOF_UNIPROP; /* no official name, but it's
13707 /* \p means they want Unicode semantics */
13708 RExC_uni_semantics = 1;
13711 case 'n': value = '\n'; break;
13712 case 'r': value = '\r'; break;
13713 case 't': value = '\t'; break;
13714 case 'f': value = '\f'; break;
13715 case 'b': value = '\b'; break;
13716 case 'e': value = ASCII_TO_NATIVE('\033');break;
13717 case 'a': value = '\a'; break;
13719 RExC_parse--; /* function expects to be pointed at the 'o' */
13721 const char* error_msg;
13722 bool valid = grok_bslash_o(&RExC_parse,
13725 SIZE_ONLY, /* warnings in pass
13728 silence_non_portable,
13734 if (PL_encoding && value < 0x100) {
13735 goto recode_encoding;
13739 RExC_parse--; /* function expects to be pointed at the 'x' */
13741 const char* error_msg;
13742 bool valid = grok_bslash_x(&RExC_parse,
13745 TRUE, /* Output warnings */
13747 silence_non_portable,
13753 if (PL_encoding && value < 0x100)
13754 goto recode_encoding;
13757 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13759 case '0': case '1': case '2': case '3': case '4':
13760 case '5': case '6': case '7':
13762 /* Take 1-3 octal digits */
13763 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13764 numlen = (strict) ? 4 : 3;
13765 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13766 RExC_parse += numlen;
13769 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13770 vFAIL("Need exactly 3 octal digits");
13772 else if (! SIZE_ONLY /* like \08, \178 */
13774 && RExC_parse < RExC_end
13775 && isDIGIT(*RExC_parse)
13776 && ckWARN(WARN_REGEXP))
13778 SAVEFREESV(RExC_rx_sv);
13779 reg_warn_non_literal_string(
13781 form_short_octal_warning(RExC_parse, numlen));
13782 (void)ReREFCNT_inc(RExC_rx_sv);
13785 if (PL_encoding && value < 0x100)
13786 goto recode_encoding;
13790 if (! RExC_override_recoding) {
13791 SV* enc = PL_encoding;
13792 value = reg_recode((const char)(U8)value, &enc);
13795 vFAIL("Invalid escape in the specified encoding");
13797 else if (SIZE_ONLY) {
13798 ckWARNreg(RExC_parse,
13799 "Invalid escape in the specified encoding");
13805 /* Allow \_ to not give an error */
13806 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13808 vFAIL2("Unrecognized escape \\%c in character class",
13812 SAVEFREESV(RExC_rx_sv);
13813 ckWARN2reg(RExC_parse,
13814 "Unrecognized escape \\%c in character class passed through",
13816 (void)ReREFCNT_inc(RExC_rx_sv);
13820 } /* End of switch on char following backslash */
13821 } /* end of handling backslash escape sequences */
13824 literal_endpoint++;
13827 /* Here, we have the current token in 'value' */
13829 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13832 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
13833 * literal, as is the character that began the false range, i.e.
13834 * the 'a' in the examples */
13837 const int w = (RExC_parse >= rangebegin)
13838 ? RExC_parse - rangebegin
13842 "False [] range \"%"UTF8f"\"",
13843 UTF8fARG(UTF, w, rangebegin));
13846 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13847 ckWARN2reg(RExC_parse,
13848 "False [] range \"%"UTF8f"\"",
13849 UTF8fARG(UTF, w, rangebegin));
13850 (void)ReREFCNT_inc(RExC_rx_sv);
13851 cp_list = add_cp_to_invlist(cp_list, '-');
13852 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13857 range = 0; /* this was not a true range */
13858 element_count += 2; /* So counts for three values */
13861 classnum = namedclass_to_classnum(namedclass);
13863 if (LOC && namedclass < ANYOF_POSIXL_MAX
13864 #ifndef HAS_ISASCII
13865 && classnum != _CC_ASCII
13868 /* What the Posix classes (like \w, [:space:]) match in locale
13869 * isn't knowable under locale until actual match time. Room
13870 * must be reserved (one time per outer bracketed class) to
13871 * store such classes. The space will contain a bit for each
13872 * named class that is to be matched against. This isn't
13873 * needed for \p{} and pseudo-classes, as they are not affected
13874 * by locale, and hence are dealt with separately */
13875 if (! need_class) {
13878 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13881 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13883 ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13884 ANYOF_POSIXL_ZERO(ret);
13887 /* Coverity thinks it is possible for this to be negative; both
13888 * jhi and khw think it's not, but be safer */
13889 assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13890 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
13892 /* See if it already matches the complement of this POSIX
13894 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13895 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13899 posixl_matches_all = TRUE;
13900 break; /* No need to continue. Since it matches both
13901 e.g., \w and \W, it matches everything, and the
13902 bracketed class can be optimized into qr/./s */
13905 /* Add this class to those that should be checked at runtime */
13906 ANYOF_POSIXL_SET(ret, namedclass);
13908 /* The above-Latin1 characters are not subject to locale rules.
13909 * Just add them, in the second pass, to the
13910 * unconditionally-matched list */
13912 SV* scratch_list = NULL;
13914 /* Get the list of the above-Latin1 code points this
13916 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13917 PL_XPosix_ptrs[classnum],
13919 /* Odd numbers are complements, like
13920 * NDIGIT, NASCII, ... */
13921 namedclass % 2 != 0,
13923 /* Checking if 'cp_list' is NULL first saves an extra
13924 * clone. Its reference count will be decremented at the
13925 * next union, etc, or if this is the only instance, at the
13926 * end of the routine */
13928 cp_list = scratch_list;
13931 _invlist_union(cp_list, scratch_list, &cp_list);
13932 SvREFCNT_dec_NN(scratch_list);
13934 continue; /* Go get next character */
13937 else if (! SIZE_ONLY) {
13939 /* Here, not in pass1 (in that pass we skip calculating the
13940 * contents of this class), and is /l, or is a POSIX class for
13941 * which /l doesn't matter (or is a Unicode property, which is
13942 * skipped here). */
13943 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
13944 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13946 /* Here, should be \h, \H, \v, or \V. None of /d, /i
13947 * nor /l make a difference in what these match,
13948 * therefore we just add what they match to cp_list. */
13949 if (classnum != _CC_VERTSPACE) {
13950 assert( namedclass == ANYOF_HORIZWS
13951 || namedclass == ANYOF_NHORIZWS);
13953 /* It turns out that \h is just a synonym for
13955 classnum = _CC_BLANK;
13958 _invlist_union_maybe_complement_2nd(
13960 PL_XPosix_ptrs[classnum],
13961 namedclass % 2 != 0, /* Complement if odd
13962 (NHORIZWS, NVERTWS)
13967 else { /* Garden variety class. If is NASCII, NDIGIT, ...
13968 complement and use nposixes */
13969 SV** posixes_ptr = namedclass % 2 == 0
13972 SV** source_ptr = &PL_XPosix_ptrs[classnum];
13973 _invlist_union_maybe_complement_2nd(
13976 namedclass % 2 != 0,
13979 continue; /* Go get next character */
13981 } /* end of namedclass \blah */
13983 /* Here, we have a single value. If 'range' is set, it is the ending
13984 * of a range--check its validity. Later, we will handle each
13985 * individual code point in the range. If 'range' isn't set, this
13986 * could be the beginning of a range, so check for that by looking
13987 * ahead to see if the next real character to be processed is the range
13988 * indicator--the minus sign */
13991 RExC_parse = regpatws(pRExC_state, RExC_parse,
13992 FALSE /* means don't recognize comments */ );
13996 if (prevvalue > value) /* b-a */ {
13997 const int w = RExC_parse - rangebegin;
13999 "Invalid [] range \"%"UTF8f"\"",
14000 UTF8fARG(UTF, w, rangebegin));
14001 range = 0; /* not a valid range */
14005 prevvalue = value; /* save the beginning of the potential range */
14006 if (! stop_at_1 /* Can't be a range if parsing just one thing */
14007 && *RExC_parse == '-')
14009 char* next_char_ptr = RExC_parse + 1;
14010 if (skip_white) { /* Get the next real char after the '-' */
14011 next_char_ptr = regpatws(pRExC_state,
14013 FALSE); /* means don't recognize
14017 /* If the '-' is at the end of the class (just before the ']',
14018 * it is a literal minus; otherwise it is a range */
14019 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14020 RExC_parse = next_char_ptr;
14022 /* a bad range like \w-, [:word:]- ? */
14023 if (namedclass > OOB_NAMEDCLASS) {
14024 if (strict || ckWARN(WARN_REGEXP)) {
14026 RExC_parse >= rangebegin ?
14027 RExC_parse - rangebegin : 0;
14029 vFAIL4("False [] range \"%*.*s\"",
14034 "False [] range \"%*.*s\"",
14039 cp_list = add_cp_to_invlist(cp_list, '-');
14043 range = 1; /* yeah, it's a range! */
14044 continue; /* but do it the next time */
14049 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
14052 /* non-Latin1 code point implies unicode semantics. Must be set in
14053 * pass1 so is there for the whole of pass 2 */
14055 RExC_uni_semantics = 1;
14058 /* Ready to process either the single value, or the completed range.
14059 * For single-valued non-inverted ranges, we consider the possibility
14060 * of multi-char folds. (We made a conscious decision to not do this
14061 * for the other cases because it can often lead to non-intuitive
14062 * results. For example, you have the peculiar case that:
14063 * "s s" =~ /^[^\xDF]+$/i => Y
14064 * "ss" =~ /^[^\xDF]+$/i => N
14066 * See [perl #89750] */
14067 if (FOLD && allow_multi_folds && value == prevvalue) {
14068 if (value == LATIN_SMALL_LETTER_SHARP_S
14069 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14072 /* Here <value> is indeed a multi-char fold. Get what it is */
14074 U8 foldbuf[UTF8_MAXBYTES_CASE];
14077 UV folded = _to_uni_fold_flags(
14081 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14082 ? FOLD_FLAGS_NOMIX_ASCII
14086 /* Here, <folded> should be the first character of the
14087 * multi-char fold of <value>, with <foldbuf> containing the
14088 * whole thing. But, if this fold is not allowed (because of
14089 * the flags), <fold> will be the same as <value>, and should
14090 * be processed like any other character, so skip the special
14092 if (folded != value) {
14094 /* Skip if we are recursed, currently parsing the class
14095 * again. Otherwise add this character to the list of
14096 * multi-char folds. */
14097 if (! RExC_in_multi_char_class) {
14098 AV** this_array_ptr;
14100 STRLEN cp_count = utf8_length(foldbuf,
14101 foldbuf + foldlen);
14102 SV* multi_fold = sv_2mortal(newSVpvs(""));
14104 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14107 if (! multi_char_matches) {
14108 multi_char_matches = newAV();
14111 /* <multi_char_matches> is actually an array of arrays.
14112 * There will be one or two top-level elements: [2],
14113 * and/or [3]. The [2] element is an array, each
14114 * element thereof is a character which folds to TWO
14115 * characters; [3] is for folds to THREE characters.
14116 * (Unicode guarantees a maximum of 3 characters in any
14117 * fold.) When we rewrite the character class below,
14118 * we will do so such that the longest folds are
14119 * written first, so that it prefers the longest
14120 * matching strings first. This is done even if it
14121 * turns out that any quantifier is non-greedy, out of
14122 * programmer laziness. Tom Christiansen has agreed
14123 * that this is ok. This makes the test for the
14124 * ligature 'ffi' come before the test for 'ff' */
14125 if (av_exists(multi_char_matches, cp_count)) {
14126 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14128 this_array = *this_array_ptr;
14131 this_array = newAV();
14132 av_store(multi_char_matches, cp_count,
14135 av_push(this_array, multi_fold);
14138 /* This element should not be processed further in this
14141 value = save_value;
14142 prevvalue = save_prevvalue;
14148 /* Deal with this element of the class */
14151 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14154 SV* this_range = _new_invlist(1);
14155 _append_range_to_invlist(this_range, prevvalue, value);
14157 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14158 * If this range was specified using something like 'i-j', we want
14159 * to include only the 'i' and the 'j', and not anything in
14160 * between, so exclude non-ASCII, non-alphabetics from it.
14161 * However, if the range was specified with something like
14162 * [\x89-\x91] or [\x89-j], all code points within it should be
14163 * included. literal_endpoint==2 means both ends of the range used
14164 * a literal character, not \x{foo} */
14165 if (literal_endpoint == 2
14166 && ((prevvalue >= 'a' && value <= 'z')
14167 || (prevvalue >= 'A' && value <= 'Z')))
14169 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14172 /* Since this above only contains ascii, the intersection of it
14173 * with anything will still yield only ascii */
14174 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14177 _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14178 literal_endpoint = 0;
14182 range = 0; /* this range (if it was one) is done now */
14183 } /* End of loop through all the text within the brackets */
14185 /* If anything in the class expands to more than one character, we have to
14186 * deal with them by building up a substitute parse string, and recursively
14187 * calling reg() on it, instead of proceeding */
14188 if (multi_char_matches) {
14189 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14192 char *save_end = RExC_end;
14193 char *save_parse = RExC_parse;
14194 bool first_time = TRUE; /* First multi-char occurrence doesn't get
14199 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
14200 because too confusing */
14202 sv_catpv(substitute_parse, "(?:");
14206 /* Look at the longest folds first */
14207 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14209 if (av_exists(multi_char_matches, cp_count)) {
14210 AV** this_array_ptr;
14213 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14215 while ((this_sequence = av_pop(*this_array_ptr)) !=
14218 if (! first_time) {
14219 sv_catpv(substitute_parse, "|");
14221 first_time = FALSE;
14223 sv_catpv(substitute_parse, SvPVX(this_sequence));
14228 /* If the character class contains anything else besides these
14229 * multi-character folds, have to include it in recursive parsing */
14230 if (element_count) {
14231 sv_catpv(substitute_parse, "|[");
14232 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14233 sv_catpv(substitute_parse, "]");
14236 sv_catpv(substitute_parse, ")");
14239 /* This is a way to get the parse to skip forward a whole named
14240 * sequence instead of matching the 2nd character when it fails the
14242 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14246 RExC_parse = SvPV(substitute_parse, len);
14247 RExC_end = RExC_parse + len;
14248 RExC_in_multi_char_class = 1;
14249 RExC_emit = (regnode *)orig_emit;
14251 ret = reg(pRExC_state, 1, ®_flags, depth+1);
14253 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14255 RExC_parse = save_parse;
14256 RExC_end = save_end;
14257 RExC_in_multi_char_class = 0;
14258 SvREFCNT_dec_NN(multi_char_matches);
14262 /* Here, we've gone through the entire class and dealt with multi-char
14263 * folds. We are now in a position that we can do some checks to see if we
14264 * can optimize this ANYOF node into a simpler one, even in Pass 1.
14265 * Currently we only do two checks:
14266 * 1) is in the unlikely event that the user has specified both, eg. \w and
14267 * \W under /l, then the class matches everything. (This optimization
14268 * is done only to make the optimizer code run later work.)
14269 * 2) if the character class contains only a single element (including a
14270 * single range), we see if there is an equivalent node for it.
14271 * Other checks are possible */
14272 if (! ret_invlist /* Can't optimize if returning the constructed
14274 && (UNLIKELY(posixl_matches_all) || element_count == 1))
14279 if (UNLIKELY(posixl_matches_all)) {
14282 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14283 \w or [:digit:] or \p{foo}
14286 /* All named classes are mapped into POSIXish nodes, with its FLAG
14287 * argument giving which class it is */
14288 switch ((I32)namedclass) {
14289 case ANYOF_UNIPROP:
14292 /* These don't depend on the charset modifiers. They always
14293 * match under /u rules */
14294 case ANYOF_NHORIZWS:
14295 case ANYOF_HORIZWS:
14296 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14299 case ANYOF_NVERTWS:
14304 /* The actual POSIXish node for all the rest depends on the
14305 * charset modifier. The ones in the first set depend only on
14306 * ASCII or, if available on this platform, locale */
14310 op = (LOC) ? POSIXL : POSIXA;
14321 /* under /a could be alpha */
14323 if (ASCII_RESTRICTED) {
14324 namedclass = ANYOF_ALPHA + (namedclass % 2);
14332 /* The rest have more possibilities depending on the charset.
14333 * We take advantage of the enum ordering of the charset
14334 * modifiers to get the exact node type, */
14336 op = POSIXD + get_regex_charset(RExC_flags);
14337 if (op > POSIXA) { /* /aa is same as /a */
14342 /* The odd numbered ones are the complements of the
14343 * next-lower even number one */
14344 if (namedclass % 2 == 1) {
14348 arg = namedclass_to_classnum(namedclass);
14352 else if (value == prevvalue) {
14354 /* Here, the class consists of just a single code point */
14357 if (! LOC && value == '\n') {
14358 op = REG_ANY; /* Optimize [^\n] */
14359 *flagp |= HASWIDTH|SIMPLE;
14363 else if (value < 256 || UTF) {
14365 /* Optimize a single value into an EXACTish node, but not if it
14366 * would require converting the pattern to UTF-8. */
14367 op = compute_EXACTish(pRExC_state);
14369 } /* Otherwise is a range */
14370 else if (! LOC) { /* locale could vary these */
14371 if (prevvalue == '0') {
14372 if (value == '9') {
14377 else if (prevvalue == 'A') {
14380 && literal_endpoint == 2
14383 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14387 else if (prevvalue == 'a') {
14390 && literal_endpoint == 2
14393 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14399 /* Here, we have changed <op> away from its initial value iff we found
14400 * an optimization */
14403 /* Throw away this ANYOF regnode, and emit the calculated one,
14404 * which should correspond to the beginning, not current, state of
14406 const char * cur_parse = RExC_parse;
14407 RExC_parse = (char *)orig_parse;
14411 /* To get locale nodes to not use the full ANYOF size would
14412 * require moving the code above that writes the portions
14413 * of it that aren't in other nodes to after this point.
14414 * e.g. ANYOF_POSIXL_SET */
14415 RExC_size = orig_size;
14419 RExC_emit = (regnode *)orig_emit;
14420 if (PL_regkind[op] == POSIXD) {
14421 if (op == POSIXL) {
14422 RExC_contains_locale = 1;
14425 op += NPOSIXD - POSIXD;
14430 ret = reg_node(pRExC_state, op);
14432 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14436 *flagp |= HASWIDTH|SIMPLE;
14438 else if (PL_regkind[op] == EXACT) {
14439 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14440 TRUE /* downgradable to EXACT */
14444 RExC_parse = (char *) cur_parse;
14446 SvREFCNT_dec(posixes);
14447 SvREFCNT_dec(nposixes);
14448 SvREFCNT_dec(cp_list);
14449 SvREFCNT_dec(cp_foldable_list);
14456 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14458 /* If folding, we calculate all characters that could fold to or from the
14459 * ones already on the list */
14460 if (cp_foldable_list) {
14462 UV start, end; /* End points of code point ranges */
14464 SV* fold_intersection = NULL;
14467 /* Our calculated list will be for Unicode rules. For locale
14468 * matching, we have to keep a separate list that is consulted at
14469 * runtime only when the locale indicates Unicode rules. For
14470 * non-locale, we just use to the general list */
14472 use_list = &only_utf8_locale_list;
14475 use_list = &cp_list;
14478 /* Only the characters in this class that participate in folds need
14479 * be checked. Get the intersection of this class and all the
14480 * possible characters that are foldable. This can quickly narrow
14481 * down a large class */
14482 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14483 &fold_intersection);
14485 /* The folds for all the Latin1 characters are hard-coded into this
14486 * program, but we have to go out to disk to get the others. */
14487 if (invlist_highest(cp_foldable_list) >= 256) {
14489 /* This is a hash that for a particular fold gives all
14490 * characters that are involved in it */
14491 if (! PL_utf8_foldclosures) {
14492 _load_PL_utf8_foldclosures();
14496 /* Now look at the foldable characters in this class individually */
14497 invlist_iterinit(fold_intersection);
14498 while (invlist_iternext(fold_intersection, &start, &end)) {
14501 /* Look at every character in the range */
14502 for (j = start; j <= end; j++) {
14503 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14509 if (IS_IN_SOME_FOLD_L1(j)) {
14511 /* ASCII is always matched; non-ASCII is matched
14512 * only under Unicode rules (which could happen
14513 * under /l if the locale is a UTF-8 one */
14514 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14515 *use_list = add_cp_to_invlist(*use_list,
14516 PL_fold_latin1[j]);
14520 add_cp_to_invlist(depends_list,
14521 PL_fold_latin1[j]);
14525 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14526 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14528 add_above_Latin1_folds(pRExC_state,
14535 /* Here is an above Latin1 character. We don't have the
14536 * rules hard-coded for it. First, get its fold. This is
14537 * the simple fold, as the multi-character folds have been
14538 * handled earlier and separated out */
14539 _to_uni_fold_flags(j, foldbuf, &foldlen,
14540 (ASCII_FOLD_RESTRICTED)
14541 ? FOLD_FLAGS_NOMIX_ASCII
14544 /* Single character fold of above Latin1. Add everything in
14545 * its fold closure to the list that this node should match.
14546 * The fold closures data structure is a hash with the keys
14547 * being the UTF-8 of every character that is folded to, like
14548 * 'k', and the values each an array of all code points that
14549 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14550 * Multi-character folds are not included */
14551 if ((listp = hv_fetch(PL_utf8_foldclosures,
14552 (char *) foldbuf, foldlen, FALSE)))
14554 AV* list = (AV*) *listp;
14556 for (k = 0; k <= av_tindex(list); k++) {
14557 SV** c_p = av_fetch(list, k, FALSE);
14563 /* /aa doesn't allow folds between ASCII and non- */
14564 if ((ASCII_FOLD_RESTRICTED
14565 && (isASCII(c) != isASCII(j))))
14570 /* Folds under /l which cross the 255/256 boundary
14571 * are added to a separate list. (These are valid
14572 * only when the locale is UTF-8.) */
14573 if (c < 256 && LOC) {
14574 *use_list = add_cp_to_invlist(*use_list, c);
14578 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14580 cp_list = add_cp_to_invlist(cp_list, c);
14583 /* Similarly folds involving non-ascii Latin1
14584 * characters under /d are added to their list */
14585 depends_list = add_cp_to_invlist(depends_list,
14592 SvREFCNT_dec_NN(fold_intersection);
14595 /* Now that we have finished adding all the folds, there is no reason
14596 * to keep the foldable list separate */
14597 _invlist_union(cp_list, cp_foldable_list, &cp_list);
14598 SvREFCNT_dec_NN(cp_foldable_list);
14601 /* And combine the result (if any) with any inversion list from posix
14602 * classes. The lists are kept separate up to now because we don't want to
14603 * fold the classes (folding of those is automatically handled by the swash
14604 * fetching code) */
14605 if (posixes || nposixes) {
14606 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14607 /* Under /a and /aa, nothing above ASCII matches these */
14608 _invlist_intersection(posixes,
14609 PL_XPosix_ptrs[_CC_ASCII],
14613 if (DEPENDS_SEMANTICS) {
14614 /* Under /d, everything in the upper half of the Latin1 range
14615 * matches these complements */
14616 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14618 else if (AT_LEAST_ASCII_RESTRICTED) {
14619 /* Under /a and /aa, everything above ASCII matches these
14621 _invlist_union_complement_2nd(nposixes,
14622 PL_XPosix_ptrs[_CC_ASCII],
14626 _invlist_union(posixes, nposixes, &posixes);
14627 SvREFCNT_dec_NN(nposixes);
14630 posixes = nposixes;
14633 if (! DEPENDS_SEMANTICS) {
14635 _invlist_union(cp_list, posixes, &cp_list);
14636 SvREFCNT_dec_NN(posixes);
14643 /* Under /d, we put into a separate list the Latin1 things that
14644 * match only when the target string is utf8 */
14645 SV* nonascii_but_latin1_properties = NULL;
14646 _invlist_intersection(posixes, PL_UpperLatin1,
14647 &nonascii_but_latin1_properties);
14648 _invlist_subtract(posixes, nonascii_but_latin1_properties,
14651 _invlist_union(cp_list, posixes, &cp_list);
14652 SvREFCNT_dec_NN(posixes);
14658 if (depends_list) {
14659 _invlist_union(depends_list, nonascii_but_latin1_properties,
14661 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14664 depends_list = nonascii_but_latin1_properties;
14669 /* And combine the result (if any) with any inversion list from properties.
14670 * The lists are kept separate up to now so that we can distinguish the two
14671 * in regards to matching above-Unicode. A run-time warning is generated
14672 * if a Unicode property is matched against a non-Unicode code point. But,
14673 * we allow user-defined properties to match anything, without any warning,
14674 * and we also suppress the warning if there is a portion of the character
14675 * class that isn't a Unicode property, and which matches above Unicode, \W
14676 * or [\x{110000}] for example.
14677 * (Note that in this case, unlike the Posix one above, there is no
14678 * <depends_list>, because having a Unicode property forces Unicode
14683 /* If it matters to the final outcome, see if a non-property
14684 * component of the class matches above Unicode. If so, the
14685 * warning gets suppressed. This is true even if just a single
14686 * such code point is specified, as though not strictly correct if
14687 * another such code point is matched against, the fact that they
14688 * are using above-Unicode code points indicates they should know
14689 * the issues involved */
14691 warn_super = ! (invert
14692 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14695 _invlist_union(properties, cp_list, &cp_list);
14696 SvREFCNT_dec_NN(properties);
14699 cp_list = properties;
14703 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14707 /* Here, we have calculated what code points should be in the character
14710 * Now we can see about various optimizations. Fold calculation (which we
14711 * did above) needs to take place before inversion. Otherwise /[^k]/i
14712 * would invert to include K, which under /i would match k, which it
14713 * shouldn't. Therefore we can't invert folded locale now, as it won't be
14714 * folded until runtime */
14716 /* If we didn't do folding, it's because some information isn't available
14717 * until runtime; set the run-time fold flag for these. (We don't have to
14718 * worry about properties folding, as that is taken care of by the swash
14719 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
14720 * locales, or the class matches at least one 0-255 range code point */
14722 if (only_utf8_locale_list) {
14723 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14725 else if (cp_list) { /* Look to see if there a 0-255 code point is in
14728 invlist_iterinit(cp_list);
14729 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14730 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14732 invlist_iterfinish(cp_list);
14736 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14737 * at compile time. Besides not inverting folded locale now, we can't
14738 * invert if there are things such as \w, which aren't known until runtime
14742 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14744 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14746 _invlist_invert(cp_list);
14748 /* Any swash can't be used as-is, because we've inverted things */
14750 SvREFCNT_dec_NN(swash);
14754 /* Clear the invert flag since have just done it here */
14759 *ret_invlist = cp_list;
14760 SvREFCNT_dec(swash);
14762 /* Discard the generated node */
14764 RExC_size = orig_size;
14767 RExC_emit = orig_emit;
14772 /* Some character classes are equivalent to other nodes. Such nodes take
14773 * up less room and generally fewer operations to execute than ANYOF nodes.
14774 * Above, we checked for and optimized into some such equivalents for
14775 * certain common classes that are easy to test. Getting to this point in
14776 * the code means that the class didn't get optimized there. Since this
14777 * code is only executed in Pass 2, it is too late to save space--it has
14778 * been allocated in Pass 1, and currently isn't given back. But turning
14779 * things into an EXACTish node can allow the optimizer to join it to any
14780 * adjacent such nodes. And if the class is equivalent to things like /./,
14781 * expensive run-time swashes can be avoided. Now that we have more
14782 * complete information, we can find things necessarily missed by the
14783 * earlier code. I (khw) am not sure how much to look for here. It would
14784 * be easy, but perhaps too slow, to check any candidates against all the
14785 * node types they could possibly match using _invlistEQ(). */
14790 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14791 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14793 /* We don't optimize if we are supposed to make sure all non-Unicode
14794 * code points raise a warning, as only ANYOF nodes have this check.
14796 && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14799 U8 op = END; /* The optimzation node-type */
14800 const char * cur_parse= RExC_parse;
14802 invlist_iterinit(cp_list);
14803 if (! invlist_iternext(cp_list, &start, &end)) {
14805 /* Here, the list is empty. This happens, for example, when a
14806 * Unicode property is the only thing in the character class, and
14807 * it doesn't match anything. (perluniprops.pod notes such
14810 *flagp |= HASWIDTH|SIMPLE;
14812 else if (start == end) { /* The range is a single code point */
14813 if (! invlist_iternext(cp_list, &start, &end)
14815 /* Don't do this optimization if it would require changing
14816 * the pattern to UTF-8 */
14817 && (start < 256 || UTF))
14819 /* Here, the list contains a single code point. Can optimize
14820 * into an EXACTish node */
14829 /* A locale node under folding with one code point can be
14830 * an EXACTFL, as its fold won't be calculated until
14836 /* Here, we are generally folding, but there is only one
14837 * code point to match. If we have to, we use an EXACT
14838 * node, but it would be better for joining with adjacent
14839 * nodes in the optimization pass if we used the same
14840 * EXACTFish node that any such are likely to be. We can
14841 * do this iff the code point doesn't participate in any
14842 * folds. For example, an EXACTF of a colon is the same as
14843 * an EXACT one, since nothing folds to or from a colon. */
14845 if (IS_IN_SOME_FOLD_L1(value)) {
14850 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14855 /* If we haven't found the node type, above, it means we
14856 * can use the prevailing one */
14858 op = compute_EXACTish(pRExC_state);
14863 else if (start == 0) {
14864 if (end == UV_MAX) {
14866 *flagp |= HASWIDTH|SIMPLE;
14869 else if (end == '\n' - 1
14870 && invlist_iternext(cp_list, &start, &end)
14871 && start == '\n' + 1 && end == UV_MAX)
14874 *flagp |= HASWIDTH|SIMPLE;
14878 invlist_iterfinish(cp_list);
14881 RExC_parse = (char *)orig_parse;
14882 RExC_emit = (regnode *)orig_emit;
14884 ret = reg_node(pRExC_state, op);
14886 RExC_parse = (char *)cur_parse;
14888 if (PL_regkind[op] == EXACT) {
14889 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14890 TRUE /* downgradable to EXACT */
14894 SvREFCNT_dec_NN(cp_list);
14899 /* Here, <cp_list> contains all the code points we can determine at
14900 * compile time that match under all conditions. Go through it, and
14901 * for things that belong in the bitmap, put them there, and delete from
14902 * <cp_list>. While we are at it, see if everything above 255 is in the
14903 * list, and if so, set a flag to speed up execution */
14905 populate_ANYOF_from_invlist(ret, &cp_list);
14908 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14911 /* Here, the bitmap has been populated with all the Latin1 code points that
14912 * always match. Can now add to the overall list those that match only
14913 * when the target string is UTF-8 (<depends_list>). */
14914 if (depends_list) {
14916 _invlist_union(cp_list, depends_list, &cp_list);
14917 SvREFCNT_dec_NN(depends_list);
14920 cp_list = depends_list;
14922 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14925 /* If there is a swash and more than one element, we can't use the swash in
14926 * the optimization below. */
14927 if (swash && element_count > 1) {
14928 SvREFCNT_dec_NN(swash);
14932 set_ANYOF_arg(pRExC_state, ret, cp_list,
14933 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14935 only_utf8_locale_list,
14936 swash, has_user_defined_property);
14938 *flagp |= HASWIDTH|SIMPLE;
14940 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14941 RExC_contains_locale = 1;
14947 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14950 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14951 regnode* const node,
14953 SV* const runtime_defns,
14954 SV* const only_utf8_locale_list,
14956 const bool has_user_defined_property)
14958 /* Sets the arg field of an ANYOF-type node 'node', using information about
14959 * the node passed-in. If there is nothing outside the node's bitmap, the
14960 * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to
14961 * the count returned by add_data(), having allocated and stored an array,
14962 * av, that that count references, as follows:
14963 * av[0] stores the character class description in its textual form.
14964 * This is used later (regexec.c:Perl_regclass_swash()) to
14965 * initialize the appropriate swash, and is also useful for dumping
14966 * the regnode. This is set to &PL_sv_undef if the textual
14967 * description is not needed at run-time (as happens if the other
14968 * elements completely define the class)
14969 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14970 * computed from av[0]. But if no further computation need be done,
14971 * the swash is stored here now (and av[0] is &PL_sv_undef).
14972 * av[2] stores the inversion list of code points that match only if the
14973 * current locale is UTF-8
14974 * av[3] stores the cp_list inversion list for use in addition or instead
14975 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14976 * (Otherwise everything needed is already in av[0] and av[1])
14977 * av[4] is set if any component of the class is from a user-defined
14978 * property; used only if av[3] exists */
14982 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14984 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14985 assert(! (ANYOF_FLAGS(node)
14986 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14987 ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14990 AV * const av = newAV();
14993 assert(ANYOF_FLAGS(node)
14994 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14996 av_store(av, 0, (runtime_defns)
14997 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15000 av_store(av, 1, swash);
15001 SvREFCNT_dec_NN(cp_list);
15004 av_store(av, 1, &PL_sv_undef);
15006 av_store(av, 3, cp_list);
15007 av_store(av, 4, newSVuv(has_user_defined_property));
15011 if (only_utf8_locale_list) {
15012 av_store(av, 2, only_utf8_locale_list);
15015 av_store(av, 2, &PL_sv_undef);
15018 rv = newRV_noinc(MUTABLE_SV(av));
15019 n = add_data(pRExC_state, STR_WITH_LEN("s"));
15020 RExC_rxi->data->data[n] = (void*)rv;
15026 /* reg_skipcomment()
15028 Absorbs an /x style # comment from the input stream,
15029 returning a pointer to the first character beyond the comment, or if the
15030 comment terminates the pattern without anything following it, this returns
15031 one past the final character of the pattern (in other words, RExC_end) and
15032 sets the REG_RUN_ON_COMMENT_SEEN flag.
15034 Note it's the callers responsibility to ensure that we are
15035 actually in /x mode
15039 PERL_STATIC_INLINE char*
15040 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15042 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15046 while (p < RExC_end) {
15047 if (*(++p) == '\n') {
15052 /* we ran off the end of the pattern without ending the comment, so we have
15053 * to add an \n when wrapping */
15054 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15060 Advances the parse position, and optionally absorbs
15061 "whitespace" from the inputstream.
15063 Without /x "whitespace" means (?#...) style comments only,
15064 with /x this means (?#...) and # comments and whitespace proper.
15066 Returns the RExC_parse point from BEFORE the scan occurs.
15068 This is the /x friendly way of saying RExC_parse++.
15072 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15074 char* const retval = RExC_parse++;
15076 PERL_ARGS_ASSERT_NEXTCHAR;
15079 if (RExC_end - RExC_parse >= 3
15080 && *RExC_parse == '('
15081 && RExC_parse[1] == '?'
15082 && RExC_parse[2] == '#')
15084 while (*RExC_parse != ')') {
15085 if (RExC_parse == RExC_end)
15086 FAIL("Sequence (?#... not terminated");
15092 if (RExC_flags & RXf_PMf_EXTENDED) {
15093 char * p = regpatws(pRExC_state, RExC_parse,
15094 TRUE); /* means recognize comments */
15095 if (p != RExC_parse) {
15105 - reg_node - emit a node
15107 STATIC regnode * /* Location. */
15108 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15112 regnode * const ret = RExC_emit;
15113 GET_RE_DEBUG_FLAGS_DECL;
15115 PERL_ARGS_ASSERT_REG_NODE;
15118 SIZE_ALIGN(RExC_size);
15122 if (RExC_emit >= RExC_emit_bound)
15123 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15124 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15126 NODE_ALIGN_FILL(ret);
15128 FILL_ADVANCE_NODE(ptr, op);
15129 #ifdef RE_TRACK_PATTERN_OFFSETS
15130 if (RExC_offsets) { /* MJD */
15132 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15133 "reg_node", __LINE__,
15135 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15136 ? "Overwriting end of array!\n" : "OK",
15137 (UV)(RExC_emit - RExC_emit_start),
15138 (UV)(RExC_parse - RExC_start),
15139 (UV)RExC_offsets[0]));
15140 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15148 - reganode - emit a node with an argument
15150 STATIC regnode * /* Location. */
15151 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15155 regnode * const ret = RExC_emit;
15156 GET_RE_DEBUG_FLAGS_DECL;
15158 PERL_ARGS_ASSERT_REGANODE;
15161 SIZE_ALIGN(RExC_size);
15166 assert(2==regarglen[op]+1);
15168 Anything larger than this has to allocate the extra amount.
15169 If we changed this to be:
15171 RExC_size += (1 + regarglen[op]);
15173 then it wouldn't matter. Its not clear what side effect
15174 might come from that so its not done so far.
15179 if (RExC_emit >= RExC_emit_bound)
15180 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15181 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15183 NODE_ALIGN_FILL(ret);
15185 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15186 #ifdef RE_TRACK_PATTERN_OFFSETS
15187 if (RExC_offsets) { /* MJD */
15189 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15193 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15194 "Overwriting end of array!\n" : "OK",
15195 (UV)(RExC_emit - RExC_emit_start),
15196 (UV)(RExC_parse - RExC_start),
15197 (UV)RExC_offsets[0]));
15198 Set_Cur_Node_Offset;
15206 - reguni - emit (if appropriate) a Unicode character
15208 PERL_STATIC_INLINE STRLEN
15209 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15213 PERL_ARGS_ASSERT_REGUNI;
15215 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15219 - reginsert - insert an operator in front of already-emitted operand
15221 * Means relocating the operand.
15224 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15230 const int offset = regarglen[(U8)op];
15231 const int size = NODE_STEP_REGNODE + offset;
15232 GET_RE_DEBUG_FLAGS_DECL;
15234 PERL_ARGS_ASSERT_REGINSERT;
15235 PERL_UNUSED_CONTEXT;
15236 PERL_UNUSED_ARG(depth);
15237 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15238 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15247 if (RExC_open_parens) {
15249 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15250 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15251 if ( RExC_open_parens[paren] >= opnd ) {
15252 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15253 RExC_open_parens[paren] += size;
15255 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15257 if ( RExC_close_parens[paren] >= opnd ) {
15258 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15259 RExC_close_parens[paren] += size;
15261 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15266 while (src > opnd) {
15267 StructCopy(--src, --dst, regnode);
15268 #ifdef RE_TRACK_PATTERN_OFFSETS
15269 if (RExC_offsets) { /* MJD 20010112 */
15271 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15275 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15276 ? "Overwriting end of array!\n" : "OK",
15277 (UV)(src - RExC_emit_start),
15278 (UV)(dst - RExC_emit_start),
15279 (UV)RExC_offsets[0]));
15280 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15281 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15287 place = opnd; /* Op node, where operand used to be. */
15288 #ifdef RE_TRACK_PATTERN_OFFSETS
15289 if (RExC_offsets) { /* MJD */
15291 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15295 (UV)(place - RExC_emit_start) > RExC_offsets[0]
15296 ? "Overwriting end of array!\n" : "OK",
15297 (UV)(place - RExC_emit_start),
15298 (UV)(RExC_parse - RExC_start),
15299 (UV)RExC_offsets[0]));
15300 Set_Node_Offset(place, RExC_parse);
15301 Set_Node_Length(place, 1);
15304 src = NEXTOPER(place);
15305 FILL_ADVANCE_NODE(place, op);
15306 Zero(src, offset, regnode);
15310 - regtail - set the next-pointer at the end of a node chain of p to val.
15311 - SEE ALSO: regtail_study
15313 /* TODO: All three parms should be const */
15315 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15316 const regnode *val,U32 depth)
15320 GET_RE_DEBUG_FLAGS_DECL;
15322 PERL_ARGS_ASSERT_REGTAIL;
15324 PERL_UNUSED_ARG(depth);
15330 /* Find last node. */
15333 regnode * const temp = regnext(scan);
15335 SV * const mysv=sv_newmortal();
15336 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15337 regprop(RExC_rx, mysv, scan, NULL);
15338 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15339 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15340 (temp == NULL ? "->" : ""),
15341 (temp == NULL ? PL_reg_name[OP(val)] : "")
15349 if (reg_off_by_arg[OP(scan)]) {
15350 ARG_SET(scan, val - scan);
15353 NEXT_OFF(scan) = val - scan;
15359 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15360 - Look for optimizable sequences at the same time.
15361 - currently only looks for EXACT chains.
15363 This is experimental code. The idea is to use this routine to perform
15364 in place optimizations on branches and groups as they are constructed,
15365 with the long term intention of removing optimization from study_chunk so
15366 that it is purely analytical.
15368 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15369 to control which is which.
15372 /* TODO: All four parms should be const */
15375 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15376 const regnode *val,U32 depth)
15381 #ifdef EXPERIMENTAL_INPLACESCAN
15384 GET_RE_DEBUG_FLAGS_DECL;
15386 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15392 /* Find last node. */
15396 regnode * const temp = regnext(scan);
15397 #ifdef EXPERIMENTAL_INPLACESCAN
15398 if (PL_regkind[OP(scan)] == EXACT) {
15399 bool unfolded_multi_char; /* Unexamined in this routine */
15400 if (join_exact(pRExC_state, scan, &min,
15401 &unfolded_multi_char, 1, val, depth+1))
15406 switch (OP(scan)) {
15409 case EXACTFA_NO_TRIE:
15414 if( exact == PSEUDO )
15416 else if ( exact != OP(scan) )
15425 SV * const mysv=sv_newmortal();
15426 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15427 regprop(RExC_rx, mysv, scan, NULL);
15428 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15429 SvPV_nolen_const(mysv),
15430 REG_NODE_NUM(scan),
15431 PL_reg_name[exact]);
15438 SV * const mysv_val=sv_newmortal();
15439 DEBUG_PARSE_MSG("");
15440 regprop(RExC_rx, mysv_val, val, NULL);
15441 PerlIO_printf(Perl_debug_log,
15442 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15443 SvPV_nolen_const(mysv_val),
15444 (IV)REG_NODE_NUM(val),
15448 if (reg_off_by_arg[OP(scan)]) {
15449 ARG_SET(scan, val - scan);
15452 NEXT_OFF(scan) = val - scan;
15460 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15465 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15470 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15472 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15473 if (flags & (1<<bit)) {
15474 if (!set++ && lead)
15475 PerlIO_printf(Perl_debug_log, "%s",lead);
15476 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15481 PerlIO_printf(Perl_debug_log, "\n");
15483 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15488 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15494 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15496 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15497 if (flags & (1<<bit)) {
15498 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15501 if (!set++ && lead)
15502 PerlIO_printf(Perl_debug_log, "%s",lead);
15503 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15506 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15507 if (!set++ && lead) {
15508 PerlIO_printf(Perl_debug_log, "%s",lead);
15511 case REGEX_UNICODE_CHARSET:
15512 PerlIO_printf(Perl_debug_log, "UNICODE");
15514 case REGEX_LOCALE_CHARSET:
15515 PerlIO_printf(Perl_debug_log, "LOCALE");
15517 case REGEX_ASCII_RESTRICTED_CHARSET:
15518 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15520 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15521 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15524 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15530 PerlIO_printf(Perl_debug_log, "\n");
15532 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15538 Perl_regdump(pTHX_ const regexp *r)
15542 SV * const sv = sv_newmortal();
15543 SV *dsv= sv_newmortal();
15544 RXi_GET_DECL(r,ri);
15545 GET_RE_DEBUG_FLAGS_DECL;
15547 PERL_ARGS_ASSERT_REGDUMP;
15549 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15551 /* Header fields of interest. */
15552 if (r->anchored_substr) {
15553 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15554 RE_SV_DUMPLEN(r->anchored_substr), 30);
15555 PerlIO_printf(Perl_debug_log,
15556 "anchored %s%s at %"IVdf" ",
15557 s, RE_SV_TAIL(r->anchored_substr),
15558 (IV)r->anchored_offset);
15559 } else if (r->anchored_utf8) {
15560 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15561 RE_SV_DUMPLEN(r->anchored_utf8), 30);
15562 PerlIO_printf(Perl_debug_log,
15563 "anchored utf8 %s%s at %"IVdf" ",
15564 s, RE_SV_TAIL(r->anchored_utf8),
15565 (IV)r->anchored_offset);
15567 if (r->float_substr) {
15568 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15569 RE_SV_DUMPLEN(r->float_substr), 30);
15570 PerlIO_printf(Perl_debug_log,
15571 "floating %s%s at %"IVdf"..%"UVuf" ",
15572 s, RE_SV_TAIL(r->float_substr),
15573 (IV)r->float_min_offset, (UV)r->float_max_offset);
15574 } else if (r->float_utf8) {
15575 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15576 RE_SV_DUMPLEN(r->float_utf8), 30);
15577 PerlIO_printf(Perl_debug_log,
15578 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15579 s, RE_SV_TAIL(r->float_utf8),
15580 (IV)r->float_min_offset, (UV)r->float_max_offset);
15582 if (r->check_substr || r->check_utf8)
15583 PerlIO_printf(Perl_debug_log,
15585 (r->check_substr == r->float_substr
15586 && r->check_utf8 == r->float_utf8
15587 ? "(checking floating" : "(checking anchored"));
15588 if (r->intflags & PREGf_NOSCAN)
15589 PerlIO_printf(Perl_debug_log, " noscan");
15590 if (r->extflags & RXf_CHECK_ALL)
15591 PerlIO_printf(Perl_debug_log, " isall");
15592 if (r->check_substr || r->check_utf8)
15593 PerlIO_printf(Perl_debug_log, ") ");
15595 if (ri->regstclass) {
15596 regprop(r, sv, ri->regstclass, NULL);
15597 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15599 if (r->intflags & PREGf_ANCH) {
15600 PerlIO_printf(Perl_debug_log, "anchored");
15601 if (r->intflags & PREGf_ANCH_BOL)
15602 PerlIO_printf(Perl_debug_log, "(BOL)");
15603 if (r->intflags & PREGf_ANCH_MBOL)
15604 PerlIO_printf(Perl_debug_log, "(MBOL)");
15605 if (r->intflags & PREGf_ANCH_SBOL)
15606 PerlIO_printf(Perl_debug_log, "(SBOL)");
15607 if (r->intflags & PREGf_ANCH_GPOS)
15608 PerlIO_printf(Perl_debug_log, "(GPOS)");
15609 PerlIO_putc(Perl_debug_log, ' ');
15611 if (r->intflags & PREGf_GPOS_SEEN)
15612 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15613 if (r->intflags & PREGf_SKIP)
15614 PerlIO_printf(Perl_debug_log, "plus ");
15615 if (r->intflags & PREGf_IMPLICIT)
15616 PerlIO_printf(Perl_debug_log, "implicit ");
15617 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15618 if (r->extflags & RXf_EVAL_SEEN)
15619 PerlIO_printf(Perl_debug_log, "with eval ");
15620 PerlIO_printf(Perl_debug_log, "\n");
15622 regdump_extflags("r->extflags: ",r->extflags);
15623 regdump_intflags("r->intflags: ",r->intflags);
15626 PERL_ARGS_ASSERT_REGDUMP;
15627 PERL_UNUSED_CONTEXT;
15628 PERL_UNUSED_ARG(r);
15629 #endif /* DEBUGGING */
15633 - regprop - printable representation of opcode, with run time support
15637 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15643 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15644 static const char * const anyofs[] = {
15645 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15646 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
15647 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
15648 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
15649 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
15650 || _CC_VERTSPACE != 16
15651 #error Need to adjust order of anyofs[]
15688 RXi_GET_DECL(prog,progi);
15689 GET_RE_DEBUG_FLAGS_DECL;
15691 PERL_ARGS_ASSERT_REGPROP;
15695 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
15696 /* It would be nice to FAIL() here, but this may be called from
15697 regexec.c, and it would be hard to supply pRExC_state. */
15698 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15699 (int)OP(o), (int)REGNODE_MAX);
15700 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15702 k = PL_regkind[OP(o)];
15705 sv_catpvs(sv, " ");
15706 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15707 * is a crude hack but it may be the best for now since
15708 * we have no flag "this EXACTish node was UTF-8"
15710 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15711 PERL_PV_ESCAPE_UNI_DETECT |
15712 PERL_PV_ESCAPE_NONASCII |
15713 PERL_PV_PRETTY_ELLIPSES |
15714 PERL_PV_PRETTY_LTGT |
15715 PERL_PV_PRETTY_NOCLEAR
15717 } else if (k == TRIE) {
15718 /* print the details of the trie in dumpuntil instead, as
15719 * progi->data isn't available here */
15720 const char op = OP(o);
15721 const U32 n = ARG(o);
15722 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15723 (reg_ac_data *)progi->data->data[n] :
15725 const reg_trie_data * const trie
15726 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15728 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15729 DEBUG_TRIE_COMPILE_r(
15730 Perl_sv_catpvf(aTHX_ sv,
15731 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15732 (UV)trie->startstate,
15733 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15734 (UV)trie->wordcount,
15737 (UV)TRIE_CHARCOUNT(trie),
15738 (UV)trie->uniquecharcount
15741 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15742 sv_catpvs(sv, "[");
15743 (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15745 : TRIE_BITMAP(trie));
15746 sv_catpvs(sv, "]");
15749 } else if (k == CURLY) {
15750 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15751 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15752 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15754 else if (k == WHILEM && o->flags) /* Ordinal/of */
15755 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15756 else if (k == REF || k == OPEN || k == CLOSE
15757 || k == GROUPP || OP(o)==ACCEPT)
15759 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15760 if ( RXp_PAREN_NAMES(prog) ) {
15761 if ( k != REF || (OP(o) < NREF)) {
15762 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15763 SV **name= av_fetch(list, ARG(o), 0 );
15765 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15768 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15769 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15770 I32 *nums=(I32*)SvPVX(sv_dat);
15771 SV **name= av_fetch(list, nums[0], 0 );
15774 for ( n=0; n<SvIVX(sv_dat); n++ ) {
15775 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15776 (n ? "," : ""), (IV)nums[n]);
15778 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15782 if ( k == REF && reginfo) {
15783 U32 n = ARG(o); /* which paren pair */
15784 I32 ln = prog->offs[n].start;
15785 if (prog->lastparen < n || ln == -1)
15786 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15787 else if (ln == prog->offs[n].end)
15788 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15790 const char *s = reginfo->strbeg + ln;
15791 Perl_sv_catpvf(aTHX_ sv, ": ");
15792 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15793 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15796 } else if (k == GOSUB)
15797 /* Paren and offset */
15798 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15799 else if (k == VERB) {
15801 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15802 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15803 } else if (k == LOGICAL)
15804 /* 2: embedded, otherwise 1 */
15805 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15806 else if (k == ANYOF) {
15807 const U8 flags = ANYOF_FLAGS(o);
15811 if (flags & ANYOF_LOCALE_FLAGS)
15812 sv_catpvs(sv, "{loc}");
15813 if (flags & ANYOF_LOC_FOLD)
15814 sv_catpvs(sv, "{i}");
15815 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15816 if (flags & ANYOF_INVERT)
15817 sv_catpvs(sv, "^");
15819 /* output what the standard cp 0-255 bitmap matches */
15820 do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15822 /* output any special charclass tests (used entirely under use
15824 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15826 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15827 if (ANYOF_POSIXL_TEST(o,i)) {
15828 sv_catpv(sv, anyofs[i]);
15834 if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15836 |ANYOF_NONBITMAP_NON_UTF8
15840 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15841 if (flags & ANYOF_INVERT)
15842 /*make sure the invert info is in each */
15843 sv_catpvs(sv, "^");
15846 if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15847 sv_catpvs(sv, "{non-utf8-latin1-all}");
15850 /* output information about the unicode matching */
15851 if (flags & ANYOF_ABOVE_LATIN1_ALL)
15852 sv_catpvs(sv, "{unicode_all}");
15853 else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15854 SV *lv; /* Set if there is something outside the bit map. */
15855 bool byte_output = FALSE; /* If something in the bitmap has
15857 SV *only_utf8_locale;
15859 /* Get the stuff that wasn't in the bitmap */
15860 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15861 &lv, &only_utf8_locale);
15862 if (lv && lv != &PL_sv_undef) {
15863 char *s = savesvpv(lv);
15864 char * const origs = s;
15866 while (*s && *s != '\n')
15870 const char * const t = ++s;
15872 if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15873 sv_catpvs(sv, "{outside bitmap}");
15876 sv_catpvs(sv, "{utf8}");
15880 sv_catpvs(sv, " ");
15886 /* Truncate very long output */
15887 if (s - origs > 256) {
15888 Perl_sv_catpvf(aTHX_ sv,
15890 (int) (s - origs - 1),
15896 else if (*s == '\t') {
15910 SvREFCNT_dec_NN(lv);
15913 if ((flags & ANYOF_LOC_FOLD)
15914 && only_utf8_locale
15915 && only_utf8_locale != &PL_sv_undef)
15918 int max_entries = 256;
15920 sv_catpvs(sv, "{utf8 locale}");
15921 invlist_iterinit(only_utf8_locale);
15922 while (invlist_iternext(only_utf8_locale,
15924 put_range(sv, start, end);
15926 if (max_entries < 0) {
15927 sv_catpvs(sv, "...");
15931 invlist_iterfinish(only_utf8_locale);
15936 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15938 else if (k == POSIXD || k == NPOSIXD) {
15939 U8 index = FLAGS(o) * 2;
15940 if (index < C_ARRAY_LENGTH(anyofs)) {
15941 if (*anyofs[index] != '[') {
15944 sv_catpv(sv, anyofs[index]);
15945 if (*anyofs[index] != '[') {
15950 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15953 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15954 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15956 PERL_UNUSED_CONTEXT;
15957 PERL_UNUSED_ARG(sv);
15958 PERL_UNUSED_ARG(o);
15959 PERL_UNUSED_ARG(prog);
15960 PERL_UNUSED_ARG(reginfo);
15961 #endif /* DEBUGGING */
15967 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15968 { /* Assume that RE_INTUIT is set */
15970 struct regexp *const prog = ReANY(r);
15971 GET_RE_DEBUG_FLAGS_DECL;
15973 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15974 PERL_UNUSED_CONTEXT;
15978 const char * const s = SvPV_nolen_const(prog->check_substr
15979 ? prog->check_substr : prog->check_utf8);
15981 if (!PL_colorset) reginitcolors();
15982 PerlIO_printf(Perl_debug_log,
15983 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15985 prog->check_substr ? "" : "utf8 ",
15986 PL_colors[5],PL_colors[0],
15989 (strlen(s) > 60 ? "..." : ""));
15992 return prog->check_substr ? prog->check_substr : prog->check_utf8;
15998 handles refcounting and freeing the perl core regexp structure. When
15999 it is necessary to actually free the structure the first thing it
16000 does is call the 'free' method of the regexp_engine associated to
16001 the regexp, allowing the handling of the void *pprivate; member
16002 first. (This routine is not overridable by extensions, which is why
16003 the extensions free is called first.)
16005 See regdupe and regdupe_internal if you change anything here.
16007 #ifndef PERL_IN_XSUB_RE
16009 Perl_pregfree(pTHX_ REGEXP *r)
16015 Perl_pregfree2(pTHX_ REGEXP *rx)
16018 struct regexp *const r = ReANY(rx);
16019 GET_RE_DEBUG_FLAGS_DECL;
16021 PERL_ARGS_ASSERT_PREGFREE2;
16023 if (r->mother_re) {
16024 ReREFCNT_dec(r->mother_re);
16026 CALLREGFREE_PVT(rx); /* free the private data */
16027 SvREFCNT_dec(RXp_PAREN_NAMES(r));
16028 Safefree(r->xpv_len_u.xpvlenu_pv);
16031 SvREFCNT_dec(r->anchored_substr);
16032 SvREFCNT_dec(r->anchored_utf8);
16033 SvREFCNT_dec(r->float_substr);
16034 SvREFCNT_dec(r->float_utf8);
16035 Safefree(r->substrs);
16037 RX_MATCH_COPY_FREE(rx);
16038 #ifdef PERL_ANY_COW
16039 SvREFCNT_dec(r->saved_copy);
16042 SvREFCNT_dec(r->qr_anoncv);
16043 rx->sv_u.svu_rx = 0;
16048 This is a hacky workaround to the structural issue of match results
16049 being stored in the regexp structure which is in turn stored in
16050 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16051 could be PL_curpm in multiple contexts, and could require multiple
16052 result sets being associated with the pattern simultaneously, such
16053 as when doing a recursive match with (??{$qr})
16055 The solution is to make a lightweight copy of the regexp structure
16056 when a qr// is returned from the code executed by (??{$qr}) this
16057 lightweight copy doesn't actually own any of its data except for
16058 the starp/end and the actual regexp structure itself.
16064 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16066 struct regexp *ret;
16067 struct regexp *const r = ReANY(rx);
16068 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16070 PERL_ARGS_ASSERT_REG_TEMP_COPY;
16073 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16075 SvOK_off((SV *)ret_x);
16077 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16078 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
16079 made both spots point to the same regexp body.) */
16080 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16081 assert(!SvPVX(ret_x));
16082 ret_x->sv_u.svu_rx = temp->sv_any;
16083 temp->sv_any = NULL;
16084 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16085 SvREFCNT_dec_NN(temp);
16086 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16087 ing below will not set it. */
16088 SvCUR_set(ret_x, SvCUR(rx));
16091 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16092 sv_force_normal(sv) is called. */
16094 ret = ReANY(ret_x);
16096 SvFLAGS(ret_x) |= SvUTF8(rx);
16097 /* We share the same string buffer as the original regexp, on which we
16098 hold a reference count, incremented when mother_re is set below.
16099 The string pointer is copied here, being part of the regexp struct.
16101 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16102 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16104 const I32 npar = r->nparens+1;
16105 Newx(ret->offs, npar, regexp_paren_pair);
16106 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16109 Newx(ret->substrs, 1, struct reg_substr_data);
16110 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16112 SvREFCNT_inc_void(ret->anchored_substr);
16113 SvREFCNT_inc_void(ret->anchored_utf8);
16114 SvREFCNT_inc_void(ret->float_substr);
16115 SvREFCNT_inc_void(ret->float_utf8);
16117 /* check_substr and check_utf8, if non-NULL, point to either their
16118 anchored or float namesakes, and don't hold a second reference. */
16120 RX_MATCH_COPIED_off(ret_x);
16121 #ifdef PERL_ANY_COW
16122 ret->saved_copy = NULL;
16124 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16125 SvREFCNT_inc_void(ret->qr_anoncv);
16131 /* regfree_internal()
16133 Free the private data in a regexp. This is overloadable by
16134 extensions. Perl takes care of the regexp structure in pregfree(),
16135 this covers the *pprivate pointer which technically perl doesn't
16136 know about, however of course we have to handle the
16137 regexp_internal structure when no extension is in use.
16139 Note this is called before freeing anything in the regexp
16144 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16147 struct regexp *const r = ReANY(rx);
16148 RXi_GET_DECL(r,ri);
16149 GET_RE_DEBUG_FLAGS_DECL;
16151 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16157 SV *dsv= sv_newmortal();
16158 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16159 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16160 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16161 PL_colors[4],PL_colors[5],s);
16164 #ifdef RE_TRACK_PATTERN_OFFSETS
16166 Safefree(ri->u.offsets); /* 20010421 MJD */
16168 if (ri->code_blocks) {
16170 for (n = 0; n < ri->num_code_blocks; n++)
16171 SvREFCNT_dec(ri->code_blocks[n].src_regex);
16172 Safefree(ri->code_blocks);
16176 int n = ri->data->count;
16179 /* If you add a ->what type here, update the comment in regcomp.h */
16180 switch (ri->data->what[n]) {
16186 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16189 Safefree(ri->data->data[n]);
16195 { /* Aho Corasick add-on structure for a trie node.
16196 Used in stclass optimization only */
16198 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16200 refcount = --aho->refcount;
16203 PerlMemShared_free(aho->states);
16204 PerlMemShared_free(aho->fail);
16205 /* do this last!!!! */
16206 PerlMemShared_free(ri->data->data[n]);
16207 /* we should only ever get called once, so
16208 * assert as much, and also guard the free
16209 * which /might/ happen twice. At the least
16210 * it will make code anlyzers happy and it
16211 * doesn't cost much. - Yves */
16212 assert(ri->regstclass);
16213 if (ri->regstclass) {
16214 PerlMemShared_free(ri->regstclass);
16215 ri->regstclass = 0;
16222 /* trie structure. */
16224 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16226 refcount = --trie->refcount;
16229 PerlMemShared_free(trie->charmap);
16230 PerlMemShared_free(trie->states);
16231 PerlMemShared_free(trie->trans);
16233 PerlMemShared_free(trie->bitmap);
16235 PerlMemShared_free(trie->jump);
16236 PerlMemShared_free(trie->wordinfo);
16237 /* do this last!!!! */
16238 PerlMemShared_free(ri->data->data[n]);
16243 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16244 ri->data->what[n]);
16247 Safefree(ri->data->what);
16248 Safefree(ri->data);
16254 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16255 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16256 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16259 re_dup - duplicate a regexp.
16261 This routine is expected to clone a given regexp structure. It is only
16262 compiled under USE_ITHREADS.
16264 After all of the core data stored in struct regexp is duplicated
16265 the regexp_engine.dupe method is used to copy any private data
16266 stored in the *pprivate pointer. This allows extensions to handle
16267 any duplication it needs to do.
16269 See pregfree() and regfree_internal() if you change anything here.
16271 #if defined(USE_ITHREADS)
16272 #ifndef PERL_IN_XSUB_RE
16274 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16278 const struct regexp *r = ReANY(sstr);
16279 struct regexp *ret = ReANY(dstr);
16281 PERL_ARGS_ASSERT_RE_DUP_GUTS;
16283 npar = r->nparens+1;
16284 Newx(ret->offs, npar, regexp_paren_pair);
16285 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16287 if (ret->substrs) {
16288 /* Do it this way to avoid reading from *r after the StructCopy().
16289 That way, if any of the sv_dup_inc()s dislodge *r from the L1
16290 cache, it doesn't matter. */
16291 const bool anchored = r->check_substr
16292 ? r->check_substr == r->anchored_substr
16293 : r->check_utf8 == r->anchored_utf8;
16294 Newx(ret->substrs, 1, struct reg_substr_data);
16295 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16297 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16298 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16299 ret->float_substr = sv_dup_inc(ret->float_substr, param);
16300 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16302 /* check_substr and check_utf8, if non-NULL, point to either their
16303 anchored or float namesakes, and don't hold a second reference. */
16305 if (ret->check_substr) {
16307 assert(r->check_utf8 == r->anchored_utf8);
16308 ret->check_substr = ret->anchored_substr;
16309 ret->check_utf8 = ret->anchored_utf8;
16311 assert(r->check_substr == r->float_substr);
16312 assert(r->check_utf8 == r->float_utf8);
16313 ret->check_substr = ret->float_substr;
16314 ret->check_utf8 = ret->float_utf8;
16316 } else if (ret->check_utf8) {
16318 ret->check_utf8 = ret->anchored_utf8;
16320 ret->check_utf8 = ret->float_utf8;
16325 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16326 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16329 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16331 if (RX_MATCH_COPIED(dstr))
16332 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
16334 ret->subbeg = NULL;
16335 #ifdef PERL_ANY_COW
16336 ret->saved_copy = NULL;
16339 /* Whether mother_re be set or no, we need to copy the string. We
16340 cannot refrain from copying it when the storage points directly to
16341 our mother regexp, because that's
16342 1: a buffer in a different thread
16343 2: something we no longer hold a reference on
16344 so we need to copy it locally. */
16345 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16346 ret->mother_re = NULL;
16348 #endif /* PERL_IN_XSUB_RE */
16353 This is the internal complement to regdupe() which is used to copy
16354 the structure pointed to by the *pprivate pointer in the regexp.
16355 This is the core version of the extension overridable cloning hook.
16356 The regexp structure being duplicated will be copied by perl prior
16357 to this and will be provided as the regexp *r argument, however
16358 with the /old/ structures pprivate pointer value. Thus this routine
16359 may override any copying normally done by perl.
16361 It returns a pointer to the new regexp_internal structure.
16365 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16368 struct regexp *const r = ReANY(rx);
16369 regexp_internal *reti;
16371 RXi_GET_DECL(r,ri);
16373 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16377 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16378 char, regexp_internal);
16379 Copy(ri->program, reti->program, len+1, regnode);
16381 reti->num_code_blocks = ri->num_code_blocks;
16382 if (ri->code_blocks) {
16384 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16385 struct reg_code_block);
16386 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16387 struct reg_code_block);
16388 for (n = 0; n < ri->num_code_blocks; n++)
16389 reti->code_blocks[n].src_regex = (REGEXP*)
16390 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16393 reti->code_blocks = NULL;
16395 reti->regstclass = NULL;
16398 struct reg_data *d;
16399 const int count = ri->data->count;
16402 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16403 char, struct reg_data);
16404 Newx(d->what, count, U8);
16407 for (i = 0; i < count; i++) {
16408 d->what[i] = ri->data->what[i];
16409 switch (d->what[i]) {
16410 /* see also regcomp.h and regfree_internal() */
16411 case 'a': /* actually an AV, but the dup function is identical. */
16415 case 'u': /* actually an HV, but the dup function is identical. */
16416 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16419 /* This is cheating. */
16420 Newx(d->data[i], 1, regnode_ssc);
16421 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16422 reti->regstclass = (regnode*)d->data[i];
16425 /* Trie stclasses are readonly and can thus be shared
16426 * without duplication. We free the stclass in pregfree
16427 * when the corresponding reg_ac_data struct is freed.
16429 reti->regstclass= ri->regstclass;
16433 ((reg_trie_data*)ri->data->data[i])->refcount++;
16438 d->data[i] = ri->data->data[i];
16441 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16442 ri->data->what[i]);
16451 reti->name_list_idx = ri->name_list_idx;
16453 #ifdef RE_TRACK_PATTERN_OFFSETS
16454 if (ri->u.offsets) {
16455 Newx(reti->u.offsets, 2*len+1, U32);
16456 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16459 SetProgLen(reti,len);
16462 return (void*)reti;
16465 #endif /* USE_ITHREADS */
16467 #ifndef PERL_IN_XSUB_RE
16470 - regnext - dig the "next" pointer out of a node
16473 Perl_regnext(pTHX_ regnode *p)
16481 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
16482 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16483 (int)OP(p), (int)REGNODE_MAX);
16486 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16495 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16498 STRLEN l1 = strlen(pat1);
16499 STRLEN l2 = strlen(pat2);
16502 const char *message;
16504 PERL_ARGS_ASSERT_RE_CROAK2;
16510 Copy(pat1, buf, l1 , char);
16511 Copy(pat2, buf + l1, l2 , char);
16512 buf[l1 + l2] = '\n';
16513 buf[l1 + l2 + 1] = '\0';
16514 va_start(args, pat2);
16515 msv = vmess(buf, &args);
16517 message = SvPV_const(msv,l1);
16520 Copy(message, buf, l1 , char);
16521 /* l1-1 to avoid \n */
16522 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16525 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
16527 #ifndef PERL_IN_XSUB_RE
16529 Perl_save_re_context(pTHX)
16533 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16535 const REGEXP * const rx = PM_GETRE(PL_curpm);
16538 for (i = 1; i <= RX_NPARENS(rx); i++) {
16539 char digits[TYPE_CHARS(long)];
16540 const STRLEN len = my_snprintf(digits, sizeof(digits),
16542 GV *const *const gvp
16543 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16546 GV * const gv = *gvp;
16547 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16559 S_put_byte(pTHX_ SV *sv, int c)
16561 PERL_ARGS_ASSERT_PUT_BYTE;
16565 case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16566 case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16567 case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16568 case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16569 case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16572 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16577 const char string = c;
16578 if (c == '-' || c == ']' || c == '\\' || c == '^')
16579 sv_catpvs(sv, "\\");
16580 sv_catpvn(sv, &string, 1);
16585 S_put_range(pTHX_ SV *sv, UV start, UV end)
16588 /* Appends to 'sv' a displayable version of the range of code points from
16589 * 'start' to 'end' */
16591 assert(start <= end);
16593 PERL_ARGS_ASSERT_PUT_RANGE;
16595 if (end - start < 3) { /* Individual chars in short ranges */
16596 for (; start <= end; start++)
16597 put_byte(sv, start);
16599 else if ( end > 255
16600 || ! isALPHANUMERIC(start)
16601 || ! isALPHANUMERIC(end)
16602 || isDIGIT(start) != isDIGIT(end)
16603 || isUPPER(start) != isUPPER(end)
16604 || isLOWER(start) != isLOWER(end)
16606 /* This final test should get optimized out except on EBCDIC
16607 * platforms, where it causes ranges that cross discontinuities
16608 * like i/j to be shown as hex instead of the misleading,
16609 * e.g. H-K (since that range includes more than H, I, J, K).
16611 || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16613 Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16615 (end < 256) ? end : 255);
16617 else { /* Here, the ends of the range are both digits, or both uppercase,
16618 or both lowercase; and there's no discontinuity in the range
16619 (which could happen on EBCDIC platforms) */
16620 put_byte(sv, start);
16621 sv_catpvs(sv, "-");
16627 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16629 /* Appends to 'sv' a displayable version of the innards of the bracketed
16630 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
16631 * output anything */
16634 bool has_output_anything = FALSE;
16636 PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16638 for (i = 0; i < 256; i++) {
16639 if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16641 /* The character at index i should be output. Find the next
16642 * character that should NOT be output */
16644 for (j = i + 1; j <= 256; j++) {
16645 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16650 /* Everything between them is a single range that should be output
16652 put_range(sv, i, j - 1);
16653 has_output_anything = TRUE;
16658 return has_output_anything;
16661 #define CLEAR_OPTSTART \
16662 if (optstart) STMT_START { \
16663 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
16664 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16668 #define DUMPUNTIL(b,e) \
16670 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16672 STATIC const regnode *
16673 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16674 const regnode *last, const regnode *plast,
16675 SV* sv, I32 indent, U32 depth)
16678 U8 op = PSEUDO; /* Arbitrary non-END op. */
16679 const regnode *next;
16680 const regnode *optstart= NULL;
16682 RXi_GET_DECL(r,ri);
16683 GET_RE_DEBUG_FLAGS_DECL;
16685 PERL_ARGS_ASSERT_DUMPUNTIL;
16687 #ifdef DEBUG_DUMPUNTIL
16688 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16689 last ? last-start : 0,plast ? plast-start : 0);
16692 if (plast && plast < last)
16695 while (PL_regkind[op] != END && (!last || node < last)) {
16697 /* While that wasn't END last time... */
16700 if (op == CLOSE || op == WHILEM)
16702 next = regnext((regnode *)node);
16705 if (OP(node) == OPTIMIZED) {
16706 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16713 regprop(r, sv, node, NULL);
16714 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16715 (int)(2*indent + 1), "", SvPVX_const(sv));
16717 if (OP(node) != OPTIMIZED) {
16718 if (next == NULL) /* Next ptr. */
16719 PerlIO_printf(Perl_debug_log, " (0)");
16720 else if (PL_regkind[(U8)op] == BRANCH
16721 && PL_regkind[OP(next)] != BRANCH )
16722 PerlIO_printf(Perl_debug_log, " (FAIL)");
16724 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16725 (void)PerlIO_putc(Perl_debug_log, '\n');
16729 if (PL_regkind[(U8)op] == BRANCHJ) {
16732 const regnode *nnode = (OP(next) == LONGJMP
16733 ? regnext((regnode *)next)
16735 if (last && nnode > last)
16737 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16740 else if (PL_regkind[(U8)op] == BRANCH) {
16742 DUMPUNTIL(NEXTOPER(node), next);
16744 else if ( PL_regkind[(U8)op] == TRIE ) {
16745 const regnode *this_trie = node;
16746 const char op = OP(node);
16747 const U32 n = ARG(node);
16748 const reg_ac_data * const ac = op>=AHOCORASICK ?
16749 (reg_ac_data *)ri->data->data[n] :
16751 const reg_trie_data * const trie =
16752 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16754 AV *const trie_words
16755 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16757 const regnode *nextbranch= NULL;
16760 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16761 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16763 PerlIO_printf(Perl_debug_log, "%*s%s ",
16764 (int)(2*(indent+3)), "",
16766 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16767 SvCUR(*elem_ptr), 60,
16768 PL_colors[0], PL_colors[1],
16770 ? PERL_PV_ESCAPE_UNI
16772 | PERL_PV_PRETTY_ELLIPSES
16773 | PERL_PV_PRETTY_LTGT
16778 U16 dist= trie->jump[word_idx+1];
16779 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16780 (UV)((dist ? this_trie + dist : next) - start));
16783 nextbranch= this_trie + trie->jump[0];
16784 DUMPUNTIL(this_trie + dist, nextbranch);
16786 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16787 nextbranch= regnext((regnode *)nextbranch);
16789 PerlIO_printf(Perl_debug_log, "\n");
16792 if (last && next > last)
16797 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
16798 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16799 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16801 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16803 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16805 else if ( op == PLUS || op == STAR) {
16806 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16808 else if (PL_regkind[(U8)op] == ANYOF) {
16809 /* arglen 1 + class block */
16810 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16811 ? ANYOF_POSIXL_SKIP
16813 node = NEXTOPER(node);
16815 else if (PL_regkind[(U8)op] == EXACT) {
16816 /* Literal string, where present. */
16817 node += NODE_SZ_STR(node) - 1;
16818 node = NEXTOPER(node);
16821 node = NEXTOPER(node);
16822 node += regarglen[(U8)op];
16824 if (op == CURLYX || op == OPEN)
16828 #ifdef DEBUG_DUMPUNTIL
16829 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16834 #endif /* DEBUGGING */
16838 * c-indentation-style: bsd
16839 * c-basic-offset: 4
16840 * indent-tabs-mode: nil
16843 * ex: set ts=8 sts=4 sw=4 et: