5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define STATIC static
106 struct RExC_state_t {
107 U32 flags; /* RXf_* are we folding, multilining? */
108 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 SSize_t whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit_bound; /* First regnode outside of the
121 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
122 implies compiling, so don't emit */
123 regnode_ssc emit_dummy; /* placeholder for emit to point to;
124 large enough for the largest
125 non-EXACTish node, so can use it as
127 I32 naughty; /* How bad is this pattern? */
128 I32 sawback; /* Did we see \1, ...? */
130 SSize_t size; /* Code size. */
131 I32 npar; /* Capture buffer count, (OPEN) plus
132 one. ("par" 0 is the whole
134 I32 nestroot; /* root parens we are in - used by
138 regnode **open_parens; /* pointers to open parens */
139 regnode **close_parens; /* pointers to close parens */
140 regnode *opend; /* END node in program */
141 I32 utf8; /* whether the pattern is utf8 or not */
142 I32 orig_utf8; /* whether the pattern was originally in utf8 */
143 /* XXX use this for future optimisation of case
144 * where pattern must be upgraded to utf8. */
145 I32 uni_semantics; /* If a d charset modifier should use unicode
146 rules, even if the pattern is not in
148 HV *paren_names; /* Paren names */
150 regnode **recurse; /* Recurse regops */
151 I32 recurse_count; /* Number of recurse regops */
152 U8 *study_chunk_recursed; /* bitmap of which parens we have moved
154 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
158 I32 override_recoding;
159 I32 in_multi_char_class;
160 struct reg_code_block *code_blocks; /* positions of literal (?{})
162 int num_code_blocks; /* size of code_blocks[] */
163 int code_index; /* next code_blocks[] slot */
164 SSize_t maxlen; /* mininum possible number of chars in string to match */
165 #ifdef ADD_TO_REGEXEC
166 char *starttry; /* -Dr: where regtry was called. */
167 #define RExC_starttry (pRExC_state->starttry)
169 SV *runtime_code_qr; /* qr with the runtime code blocks */
171 const char *lastparse;
173 AV *paren_name_list; /* idx -> name */
174 #define RExC_lastparse (pRExC_state->lastparse)
175 #define RExC_lastnum (pRExC_state->lastnum)
176 #define RExC_paren_name_list (pRExC_state->paren_name_list)
180 #define RExC_flags (pRExC_state->flags)
181 #define RExC_pm_flags (pRExC_state->pm_flags)
182 #define RExC_precomp (pRExC_state->precomp)
183 #define RExC_rx_sv (pRExC_state->rx_sv)
184 #define RExC_rx (pRExC_state->rx)
185 #define RExC_rxi (pRExC_state->rxi)
186 #define RExC_start (pRExC_state->start)
187 #define RExC_end (pRExC_state->end)
188 #define RExC_parse (pRExC_state->parse)
189 #define RExC_whilem_seen (pRExC_state->whilem_seen)
190 #ifdef RE_TRACK_PATTERN_OFFSETS
191 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
194 #define RExC_emit (pRExC_state->emit)
195 #define RExC_emit_dummy (pRExC_state->emit_dummy)
196 #define RExC_emit_start (pRExC_state->emit_start)
197 #define RExC_emit_bound (pRExC_state->emit_bound)
198 #define RExC_naughty (pRExC_state->naughty)
199 #define RExC_sawback (pRExC_state->sawback)
200 #define RExC_seen (pRExC_state->seen)
201 #define RExC_size (pRExC_state->size)
202 #define RExC_maxlen (pRExC_state->maxlen)
203 #define RExC_npar (pRExC_state->npar)
204 #define RExC_nestroot (pRExC_state->nestroot)
205 #define RExC_extralen (pRExC_state->extralen)
206 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
207 #define RExC_utf8 (pRExC_state->utf8)
208 #define RExC_uni_semantics (pRExC_state->uni_semantics)
209 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
210 #define RExC_open_parens (pRExC_state->open_parens)
211 #define RExC_close_parens (pRExC_state->close_parens)
212 #define RExC_opend (pRExC_state->opend)
213 #define RExC_paren_names (pRExC_state->paren_names)
214 #define RExC_recurse (pRExC_state->recurse)
215 #define RExC_recurse_count (pRExC_state->recurse_count)
216 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
217 #define RExC_study_chunk_recursed_bytes \
218 (pRExC_state->study_chunk_recursed_bytes)
219 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
220 #define RExC_contains_locale (pRExC_state->contains_locale)
221 #define RExC_contains_i (pRExC_state->contains_i)
222 #define RExC_override_recoding (pRExC_state->override_recoding)
223 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
226 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
227 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
228 ((*s) == '{' && regcurly(s)))
231 * Flags to be passed up and down.
233 #define WORST 0 /* Worst case. */
234 #define HASWIDTH 0x01 /* Known to match non-null strings. */
236 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
237 * character. (There needs to be a case: in the switch statement in regexec.c
238 * for any node marked SIMPLE.) Note that this is not the same thing as
241 #define SPSTART 0x04 /* Starts with * or + */
242 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
243 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
244 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
246 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
248 /* whether trie related optimizations are enabled */
249 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
250 #define TRIE_STUDY_OPT
251 #define FULL_TRIE_STUDY
257 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
258 #define PBITVAL(paren) (1 << ((paren) & 7))
259 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
260 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
261 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
263 #define REQUIRE_UTF8 STMT_START { \
265 *flagp = RESTART_UTF8; \
270 /* This converts the named class defined in regcomp.h to its equivalent class
271 * number defined in handy.h. */
272 #define namedclass_to_classnum(class) ((int) ((class) / 2))
273 #define classnum_to_namedclass(classnum) ((classnum) * 2)
275 #define _invlist_union_complement_2nd(a, b, output) \
276 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
277 #define _invlist_intersection_complement_2nd(a, b, output) \
278 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
280 /* About scan_data_t.
282 During optimisation we recurse through the regexp program performing
283 various inplace (keyhole style) optimisations. In addition study_chunk
284 and scan_commit populate this data structure with information about
285 what strings MUST appear in the pattern. We look for the longest
286 string that must appear at a fixed location, and we look for the
287 longest string that may appear at a floating location. So for instance
292 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
293 strings (because they follow a .* construct). study_chunk will identify
294 both FOO and BAR as being the longest fixed and floating strings respectively.
296 The strings can be composites, for instance
300 will result in a composite fixed substring 'foo'.
302 For each string some basic information is maintained:
304 - offset or min_offset
305 This is the position the string must appear at, or not before.
306 It also implicitly (when combined with minlenp) tells us how many
307 characters must match before the string we are searching for.
308 Likewise when combined with minlenp and the length of the string it
309 tells us how many characters must appear after the string we have
313 Only used for floating strings. This is the rightmost point that
314 the string can appear at. If set to SSize_t_MAX it indicates that the
315 string can occur infinitely far to the right.
318 A pointer to the minimum number of characters of the pattern that the
319 string was found inside. This is important as in the case of positive
320 lookahead or positive lookbehind we can have multiple patterns
325 The minimum length of the pattern overall is 3, the minimum length
326 of the lookahead part is 3, but the minimum length of the part that
327 will actually match is 1. So 'FOO's minimum length is 3, but the
328 minimum length for the F is 1. This is important as the minimum length
329 is used to determine offsets in front of and behind the string being
330 looked for. Since strings can be composites this is the length of the
331 pattern at the time it was committed with a scan_commit. Note that
332 the length is calculated by study_chunk, so that the minimum lengths
333 are not known until the full pattern has been compiled, thus the
334 pointer to the value.
338 In the case of lookbehind the string being searched for can be
339 offset past the start point of the final matching string.
340 If this value was just blithely removed from the min_offset it would
341 invalidate some of the calculations for how many chars must match
342 before or after (as they are derived from min_offset and minlen and
343 the length of the string being searched for).
344 When the final pattern is compiled and the data is moved from the
345 scan_data_t structure into the regexp structure the information
346 about lookbehind is factored in, with the information that would
347 have been lost precalculated in the end_shift field for the
350 The fields pos_min and pos_delta are used to store the minimum offset
351 and the delta to the maximum offset at the current point in the pattern.
355 typedef struct scan_data_t {
356 /*I32 len_min; unused */
357 /*I32 len_delta; unused */
361 SSize_t last_end; /* min value, <0 unless valid. */
362 SSize_t last_start_min;
363 SSize_t last_start_max;
364 SV **longest; /* Either &l_fixed, or &l_float. */
365 SV *longest_fixed; /* longest fixed string found in pattern */
366 SSize_t offset_fixed; /* offset where it starts */
367 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
368 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
369 SV *longest_float; /* longest floating string found in pattern */
370 SSize_t offset_float_min; /* earliest point in string it can appear */
371 SSize_t offset_float_max; /* latest point in string it can appear */
372 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
373 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
376 SSize_t *last_closep;
377 regnode_ssc *start_class;
380 /* The below is perhaps overboard, but this allows us to save a test at the
381 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
382 * and 'a' differ by a single bit; the same with the upper and lower case of
383 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
384 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
385 * then inverts it to form a mask, with just a single 0, in the bit position
386 * where the upper- and lowercase differ. XXX There are about 40 other
387 * instances in the Perl core where this micro-optimization could be used.
388 * Should decide if maintenance cost is worse, before changing those
390 * Returns a boolean as to whether or not 'v' is either a lowercase or
391 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
392 * compile-time constant, the generated code is better than some optimizing
393 * compilers figure out, amounting to a mask and test. The results are
394 * meaningless if 'c' is not one of [A-Za-z] */
395 #define isARG2_lower_or_UPPER_ARG1(c, v) \
396 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
399 * Forward declarations for pregcomp()'s friends.
402 static const scan_data_t zero_scan_data =
403 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
405 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
406 #define SF_BEFORE_SEOL 0x0001
407 #define SF_BEFORE_MEOL 0x0002
408 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
409 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
411 #define SF_FIX_SHIFT_EOL (+2)
412 #define SF_FL_SHIFT_EOL (+4)
414 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
415 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
417 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
418 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
419 #define SF_IS_INF 0x0040
420 #define SF_HAS_PAR 0x0080
421 #define SF_IN_PAR 0x0100
422 #define SF_HAS_EVAL 0x0200
423 #define SCF_DO_SUBSTR 0x0400
424 #define SCF_DO_STCLASS_AND 0x0800
425 #define SCF_DO_STCLASS_OR 0x1000
426 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
427 #define SCF_WHILEM_VISITED_POS 0x2000
429 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
430 #define SCF_SEEN_ACCEPT 0x8000
431 #define SCF_TRIE_DOING_RESTUDY 0x10000
433 #define UTF cBOOL(RExC_utf8)
435 /* The enums for all these are ordered so things work out correctly */
436 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
437 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
438 == REGEX_DEPENDS_CHARSET)
439 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
440 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
441 >= REGEX_UNICODE_CHARSET)
442 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
443 == REGEX_ASCII_RESTRICTED_CHARSET)
444 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
445 >= REGEX_ASCII_RESTRICTED_CHARSET)
446 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
447 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
449 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
451 /* For programs that want to be strictly Unicode compatible by dying if any
452 * attempt is made to match a non-Unicode code point against a Unicode
454 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
456 #define OOB_NAMEDCLASS -1
458 /* There is no code point that is out-of-bounds, so this is problematic. But
459 * its only current use is to initialize a variable that is always set before
461 #define OOB_UNICODE 0xDEADBEEF
463 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
464 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
467 /* length of regex to show in messages that don't mark a position within */
468 #define RegexLengthToShowInErrorMessages 127
471 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
472 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
473 * op/pragma/warn/regcomp.
475 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
476 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
478 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
479 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
481 #define REPORT_LOCATION_ARGS(offset) \
482 UTF8fARG(UTF, offset, RExC_precomp), \
483 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
486 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
487 * arg. Show regex, up to a maximum length. If it's too long, chop and add
490 #define _FAIL(code) STMT_START { \
491 const char *ellipses = ""; \
492 IV len = RExC_end - RExC_precomp; \
495 SAVEFREESV(RExC_rx_sv); \
496 if (len > RegexLengthToShowInErrorMessages) { \
497 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
498 len = RegexLengthToShowInErrorMessages - 10; \
504 #define FAIL(msg) _FAIL( \
505 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
506 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
508 #define FAIL2(msg,arg) _FAIL( \
509 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
510 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
513 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
515 #define Simple_vFAIL(m) STMT_START { \
516 const IV offset = RExC_parse - RExC_precomp; \
517 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
518 m, REPORT_LOCATION_ARGS(offset)); \
522 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
524 #define vFAIL(m) STMT_START { \
526 SAVEFREESV(RExC_rx_sv); \
531 * Like Simple_vFAIL(), but accepts two arguments.
533 #define Simple_vFAIL2(m,a1) STMT_START { \
534 const IV offset = RExC_parse - RExC_precomp; \
535 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
536 REPORT_LOCATION_ARGS(offset)); \
540 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
542 #define vFAIL2(m,a1) STMT_START { \
544 SAVEFREESV(RExC_rx_sv); \
545 Simple_vFAIL2(m, a1); \
550 * Like Simple_vFAIL(), but accepts three arguments.
552 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
553 const IV offset = RExC_parse - RExC_precomp; \
554 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
555 REPORT_LOCATION_ARGS(offset)); \
559 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
561 #define vFAIL3(m,a1,a2) STMT_START { \
563 SAVEFREESV(RExC_rx_sv); \
564 Simple_vFAIL3(m, a1, a2); \
568 * Like Simple_vFAIL(), but accepts four arguments.
570 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
571 const IV offset = RExC_parse - RExC_precomp; \
572 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
573 REPORT_LOCATION_ARGS(offset)); \
576 #define vFAIL4(m,a1,a2,a3) STMT_START { \
578 SAVEFREESV(RExC_rx_sv); \
579 Simple_vFAIL4(m, a1, a2, a3); \
582 /* A specialized version of vFAIL2 that works with UTF8f */
583 #define vFAIL2utf8f(m, a1) STMT_START { \
584 const IV offset = RExC_parse - RExC_precomp; \
586 SAVEFREESV(RExC_rx_sv); \
587 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
588 REPORT_LOCATION_ARGS(offset)); \
592 /* m is not necessarily a "literal string", in this macro */
593 #define reg_warn_non_literal_string(loc, m) STMT_START { \
594 const IV offset = loc - RExC_precomp; \
595 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
596 m, REPORT_LOCATION_ARGS(offset)); \
599 #define ckWARNreg(loc,m) STMT_START { \
600 const IV offset = loc - RExC_precomp; \
601 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
602 REPORT_LOCATION_ARGS(offset)); \
605 #define vWARN_dep(loc, m) STMT_START { \
606 const IV offset = loc - RExC_precomp; \
607 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
608 REPORT_LOCATION_ARGS(offset)); \
611 #define ckWARNdep(loc,m) STMT_START { \
612 const IV offset = loc - RExC_precomp; \
613 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
615 REPORT_LOCATION_ARGS(offset)); \
618 #define ckWARNregdep(loc,m) STMT_START { \
619 const IV offset = loc - RExC_precomp; \
620 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
622 REPORT_LOCATION_ARGS(offset)); \
625 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
626 const IV offset = loc - RExC_precomp; \
627 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
629 a1, REPORT_LOCATION_ARGS(offset)); \
632 #define ckWARN2reg(loc, m, a1) STMT_START { \
633 const IV offset = loc - RExC_precomp; \
634 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
635 a1, REPORT_LOCATION_ARGS(offset)); \
638 #define vWARN3(loc, m, a1, a2) STMT_START { \
639 const IV offset = loc - RExC_precomp; \
640 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
641 a1, a2, REPORT_LOCATION_ARGS(offset)); \
644 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
645 const IV offset = loc - RExC_precomp; \
646 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
647 a1, a2, REPORT_LOCATION_ARGS(offset)); \
650 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
651 const IV offset = loc - RExC_precomp; \
652 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
653 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
656 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
657 const IV offset = loc - RExC_precomp; \
658 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
659 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
662 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
663 const IV offset = loc - RExC_precomp; \
664 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
665 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
669 /* Allow for side effects in s */
670 #define REGC(c,s) STMT_START { \
671 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
674 /* Macros for recording node offsets. 20001227 mjd@plover.com
675 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
676 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
677 * Element 0 holds the number n.
678 * Position is 1 indexed.
680 #ifndef RE_TRACK_PATTERN_OFFSETS
681 #define Set_Node_Offset_To_R(node,byte)
682 #define Set_Node_Offset(node,byte)
683 #define Set_Cur_Node_Offset
684 #define Set_Node_Length_To_R(node,len)
685 #define Set_Node_Length(node,len)
686 #define Set_Node_Cur_Length(node,start)
687 #define Node_Offset(n)
688 #define Node_Length(n)
689 #define Set_Node_Offset_Length(node,offset,len)
690 #define ProgLen(ri) ri->u.proglen
691 #define SetProgLen(ri,x) ri->u.proglen = x
693 #define ProgLen(ri) ri->u.offsets[0]
694 #define SetProgLen(ri,x) ri->u.offsets[0] = x
695 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
697 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
698 __LINE__, (int)(node), (int)(byte))); \
700 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
703 RExC_offsets[2*(node)-1] = (byte); \
708 #define Set_Node_Offset(node,byte) \
709 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
710 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
712 #define Set_Node_Length_To_R(node,len) STMT_START { \
714 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
715 __LINE__, (int)(node), (int)(len))); \
717 Perl_croak(aTHX_ "value of node is %d in Length macro", \
720 RExC_offsets[2*(node)] = (len); \
725 #define Set_Node_Length(node,len) \
726 Set_Node_Length_To_R((node)-RExC_emit_start, len)
727 #define Set_Node_Cur_Length(node, start) \
728 Set_Node_Length(node, RExC_parse - start)
730 /* Get offsets and lengths */
731 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
732 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
734 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
735 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
736 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
740 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
741 #define EXPERIMENTAL_INPLACESCAN
742 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
744 #define DEBUG_RExC_seen() \
745 DEBUG_OPTIMISE_MORE_r({ \
746 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
748 if (RExC_seen & REG_ZERO_LEN_SEEN) \
749 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
751 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
752 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
754 if (RExC_seen & REG_GPOS_SEEN) \
755 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
757 if (RExC_seen & REG_CANY_SEEN) \
758 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
760 if (RExC_seen & REG_RECURSE_SEEN) \
761 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
763 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
764 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
766 if (RExC_seen & REG_VERBARG_SEEN) \
767 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
769 if (RExC_seen & REG_CUTGROUP_SEEN) \
770 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
772 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
773 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
775 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
776 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
778 if (RExC_seen & REG_GOSTART_SEEN) \
779 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
781 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
782 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
784 PerlIO_printf(Perl_debug_log,"\n"); \
787 #define DEBUG_STUDYDATA(str,data,depth) \
788 DEBUG_OPTIMISE_MORE_r(if(data){ \
789 PerlIO_printf(Perl_debug_log, \
790 "%*s" str "Pos:%"IVdf"/%"IVdf \
791 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
792 (int)(depth)*2, "", \
793 (IV)((data)->pos_min), \
794 (IV)((data)->pos_delta), \
795 (UV)((data)->flags), \
796 (IV)((data)->whilem_c), \
797 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
798 is_inf ? "INF " : "" \
800 if ((data)->last_found) \
801 PerlIO_printf(Perl_debug_log, \
802 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
803 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
804 SvPVX_const((data)->last_found), \
805 (IV)((data)->last_end), \
806 (IV)((data)->last_start_min), \
807 (IV)((data)->last_start_max), \
808 ((data)->longest && \
809 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
810 SvPVX_const((data)->longest_fixed), \
811 (IV)((data)->offset_fixed), \
812 ((data)->longest && \
813 (data)->longest==&((data)->longest_float)) ? "*" : "", \
814 SvPVX_const((data)->longest_float), \
815 (IV)((data)->offset_float_min), \
816 (IV)((data)->offset_float_max) \
818 PerlIO_printf(Perl_debug_log,"\n"); \
821 /* Mark that we cannot extend a found fixed substring at this point.
822 Update the longest found anchored substring and the longest found
823 floating substrings if needed. */
826 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
827 SSize_t *minlenp, int is_inf)
829 const STRLEN l = CHR_SVLEN(data->last_found);
830 const STRLEN old_l = CHR_SVLEN(*data->longest);
831 GET_RE_DEBUG_FLAGS_DECL;
833 PERL_ARGS_ASSERT_SCAN_COMMIT;
835 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
836 SvSetMagicSV(*data->longest, data->last_found);
837 if (*data->longest == data->longest_fixed) {
838 data->offset_fixed = l ? data->last_start_min : data->pos_min;
839 if (data->flags & SF_BEFORE_EOL)
841 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
843 data->flags &= ~SF_FIX_BEFORE_EOL;
844 data->minlen_fixed=minlenp;
845 data->lookbehind_fixed=0;
847 else { /* *data->longest == data->longest_float */
848 data->offset_float_min = l ? data->last_start_min : data->pos_min;
849 data->offset_float_max = (l
850 ? data->last_start_max
851 : (data->pos_delta == SSize_t_MAX
853 : data->pos_min + data->pos_delta));
855 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
856 data->offset_float_max = SSize_t_MAX;
857 if (data->flags & SF_BEFORE_EOL)
859 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
861 data->flags &= ~SF_FL_BEFORE_EOL;
862 data->minlen_float=minlenp;
863 data->lookbehind_float=0;
866 SvCUR_set(data->last_found, 0);
868 SV * const sv = data->last_found;
869 if (SvUTF8(sv) && SvMAGICAL(sv)) {
870 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
876 data->flags &= ~SF_BEFORE_EOL;
877 DEBUG_STUDYDATA("commit: ",data,0);
880 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
881 * list that describes which code points it matches */
884 S_ssc_anything(pTHX_ regnode_ssc *ssc)
886 /* Set the SSC 'ssc' to match an empty string or any code point */
888 PERL_ARGS_ASSERT_SSC_ANYTHING;
890 assert(is_ANYOF_SYNTHETIC(ssc));
892 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
893 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
894 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
898 S_ssc_is_anything(const regnode_ssc *ssc)
900 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
901 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
902 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
903 * in any way, so there's no point in using it */
908 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
910 assert(is_ANYOF_SYNTHETIC(ssc));
912 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
916 /* See if the list consists solely of the range 0 - Infinity */
917 invlist_iterinit(ssc->invlist);
918 ret = invlist_iternext(ssc->invlist, &start, &end)
922 invlist_iterfinish(ssc->invlist);
928 /* If e.g., both \w and \W are set, matches everything */
929 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
931 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
932 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
942 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
944 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
945 * string, any code point, or any posix class under locale */
947 PERL_ARGS_ASSERT_SSC_INIT;
949 Zero(ssc, 1, regnode_ssc);
950 set_ANYOF_SYNTHETIC(ssc);
951 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
954 /* If any portion of the regex is to operate under locale rules,
955 * initialization includes it. The reason this isn't done for all regexes
956 * is that the optimizer was written under the assumption that locale was
957 * all-or-nothing. Given the complexity and lack of documentation in the
958 * optimizer, and that there are inadequate test cases for locale, many
959 * parts of it may not work properly, it is safest to avoid locale unless
961 if (RExC_contains_locale) {
962 ANYOF_POSIXL_SETALL(ssc);
965 ANYOF_POSIXL_ZERO(ssc);
970 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
971 const regnode_ssc *ssc)
973 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
974 * to the list of code points matched, and locale posix classes; hence does
975 * not check its flags) */
980 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
982 assert(is_ANYOF_SYNTHETIC(ssc));
984 invlist_iterinit(ssc->invlist);
985 ret = invlist_iternext(ssc->invlist, &start, &end)
989 invlist_iterfinish(ssc->invlist);
995 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1003 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1004 const regnode_charclass* const node)
1006 /* Returns a mortal inversion list defining which code points are matched
1007 * by 'node', which is of type ANYOF. Handles complementing the result if
1008 * appropriate. If some code points aren't knowable at this time, the
1009 * returned list must, and will, contain every code point that is a
1012 SV* invlist = sv_2mortal(_new_invlist(0));
1013 SV* only_utf8_locale_invlist = NULL;
1015 const U32 n = ARG(node);
1016 bool new_node_has_latin1 = FALSE;
1018 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1020 /* Look at the data structure created by S_set_ANYOF_arg() */
1021 if (n != ANYOF_NONBITMAP_EMPTY) {
1022 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1023 AV * const av = MUTABLE_AV(SvRV(rv));
1024 SV **const ary = AvARRAY(av);
1025 assert(RExC_rxi->data->what[n] == 's');
1027 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1028 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1030 else if (ary[0] && ary[0] != &PL_sv_undef) {
1032 /* Here, no compile-time swash, and there are things that won't be
1033 * known until runtime -- we have to assume it could be anything */
1034 return _add_range_to_invlist(invlist, 0, UV_MAX);
1036 else if (ary[3] && ary[3] != &PL_sv_undef) {
1038 /* Here no compile-time swash, and no run-time only data. Use the
1039 * node's inversion list */
1040 invlist = sv_2mortal(invlist_clone(ary[3]));
1043 /* Get the code points valid only under UTF-8 locales */
1044 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1045 && ary[2] && ary[2] != &PL_sv_undef)
1047 only_utf8_locale_invlist = ary[2];
1051 /* An ANYOF node contains a bitmap for the first 256 code points, and an
1052 * inversion list for the others, but if there are code points that should
1053 * match only conditionally on the target string being UTF-8, those are
1054 * placed in the inversion list, and not the bitmap. Since there are
1055 * circumstances under which they could match, they are included in the
1056 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
1057 * here, so that when we invert below, the end result actually does include
1058 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
1059 * before we add the unconditionally matched code points */
1060 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1061 _invlist_intersection_complement_2nd(invlist,
1066 /* Add in the points from the bit map */
1067 for (i = 0; i < 256; i++) {
1068 if (ANYOF_BITMAP_TEST(node, i)) {
1069 invlist = add_cp_to_invlist(invlist, i);
1070 new_node_has_latin1 = TRUE;
1074 /* If this can match all upper Latin1 code points, have to add them
1076 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
1077 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1080 /* Similarly for these */
1081 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1082 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1085 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1086 _invlist_invert(invlist);
1088 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1090 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1091 * locale. We can skip this if there are no 0-255 at all. */
1092 _invlist_union(invlist, PL_Latin1, &invlist);
1095 /* Similarly add the UTF-8 locale possible matches. These have to be
1096 * deferred until after the non-UTF-8 locale ones are taken care of just
1097 * above, or it leads to wrong results under ANYOF_INVERT */
1098 if (only_utf8_locale_invlist) {
1099 _invlist_union_maybe_complement_2nd(invlist,
1100 only_utf8_locale_invlist,
1101 ANYOF_FLAGS(node) & ANYOF_INVERT,
1108 /* These two functions currently do the exact same thing */
1109 #define ssc_init_zero ssc_init
1111 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1112 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1114 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1115 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
1116 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1119 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1120 const regnode_charclass *and_with)
1122 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1123 * another SSC or a regular ANYOF class. Can create false positives. */
1128 PERL_ARGS_ASSERT_SSC_AND;
1130 assert(is_ANYOF_SYNTHETIC(ssc));
1132 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1133 * the code point inversion list and just the relevant flags */
1134 if (is_ANYOF_SYNTHETIC(and_with)) {
1135 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1136 anded_flags = ANYOF_FLAGS(and_with);
1138 /* XXX This is a kludge around what appears to be deficiencies in the
1139 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1140 * there are paths through the optimizer where it doesn't get weeded
1141 * out when it should. And if we don't make some extra provision for
1142 * it like the code just below, it doesn't get added when it should.
1143 * This solution is to add it only when AND'ing, which is here, and
1144 * only when what is being AND'ed is the pristine, original node
1145 * matching anything. Thus it is like adding it to ssc_anything() but
1146 * only when the result is to be AND'ed. Probably the same solution
1147 * could be adopted for the same problem we have with /l matching,
1148 * which is solved differently in S_ssc_init(), and that would lead to
1149 * fewer false positives than that solution has. But if this solution
1150 * creates bugs, the consequences are only that a warning isn't raised
1151 * that should be; while the consequences for having /l bugs is
1152 * incorrect matches */
1153 if (ssc_is_anything((regnode_ssc *)and_with)) {
1154 anded_flags |= ANYOF_WARN_SUPER;
1158 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1159 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1162 ANYOF_FLAGS(ssc) &= anded_flags;
1164 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1165 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1166 * 'and_with' may be inverted. When not inverted, we have the situation of
1168 * (C1 | P1) & (C2 | P2)
1169 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1170 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1171 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1172 * <= ((C1 & C2) | P1 | P2)
1173 * Alternatively, the last few steps could be:
1174 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1175 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1176 * <= (C1 | C2 | (P1 & P2))
1177 * We favor the second approach if either P1 or P2 is non-empty. This is
1178 * because these components are a barrier to doing optimizations, as what
1179 * they match cannot be known until the moment of matching as they are
1180 * dependent on the current locale, 'AND"ing them likely will reduce or
1182 * But we can do better if we know that C1,P1 are in their initial state (a
1183 * frequent occurrence), each matching everything:
1184 * (<everything>) & (C2 | P2) = C2 | P2
1185 * Similarly, if C2,P2 are in their initial state (again a frequent
1186 * occurrence), the result is a no-op
1187 * (C1 | P1) & (<everything>) = C1 | P1
1190 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1191 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1192 * <= (C1 & ~C2) | (P1 & ~P2)
1195 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1196 && ! is_ANYOF_SYNTHETIC(and_with))
1200 ssc_intersection(ssc,
1202 FALSE /* Has already been inverted */
1205 /* If either P1 or P2 is empty, the intersection will be also; can skip
1207 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1208 ANYOF_POSIXL_ZERO(ssc);
1210 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1212 /* Note that the Posix class component P from 'and_with' actually
1214 * P = Pa | Pb | ... | Pn
1215 * where each component is one posix class, such as in [\w\s].
1217 * ~P = ~(Pa | Pb | ... | Pn)
1218 * = ~Pa & ~Pb & ... & ~Pn
1219 * <= ~Pa | ~Pb | ... | ~Pn
1220 * The last is something we can easily calculate, but unfortunately
1221 * is likely to have many false positives. We could do better
1222 * in some (but certainly not all) instances if two classes in
1223 * P have known relationships. For example
1224 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1226 * :lower: & :print: = :lower:
1227 * And similarly for classes that must be disjoint. For example,
1228 * since \s and \w can have no elements in common based on rules in
1229 * the POSIX standard,
1230 * \w & ^\S = nothing
1231 * Unfortunately, some vendor locales do not meet the Posix
1232 * standard, in particular almost everything by Microsoft.
1233 * The loop below just changes e.g., \w into \W and vice versa */
1235 regnode_charclass_posixl temp;
1236 int add = 1; /* To calculate the index of the complement */
1238 ANYOF_POSIXL_ZERO(&temp);
1239 for (i = 0; i < ANYOF_MAX; i++) {
1241 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1242 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1244 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1245 ANYOF_POSIXL_SET(&temp, i + add);
1247 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1249 ANYOF_POSIXL_AND(&temp, ssc);
1251 } /* else ssc already has no posixes */
1252 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1253 in its initial state */
1254 else if (! is_ANYOF_SYNTHETIC(and_with)
1255 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1257 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1258 * copy it over 'ssc' */
1259 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1260 if (is_ANYOF_SYNTHETIC(and_with)) {
1261 StructCopy(and_with, ssc, regnode_ssc);
1264 ssc->invlist = anded_cp_list;
1265 ANYOF_POSIXL_ZERO(ssc);
1266 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1267 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1271 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1272 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1274 /* One or the other of P1, P2 is non-empty. */
1275 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1276 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1278 ssc_union(ssc, anded_cp_list, FALSE);
1280 else { /* P1 = P2 = empty */
1281 ssc_intersection(ssc, anded_cp_list, FALSE);
1287 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1288 const regnode_charclass *or_with)
1290 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1291 * another SSC or a regular ANYOF class. Can create false positives if
1292 * 'or_with' is to be inverted. */
1297 PERL_ARGS_ASSERT_SSC_OR;
1299 assert(is_ANYOF_SYNTHETIC(ssc));
1301 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1302 * the code point inversion list and just the relevant flags */
1303 if (is_ANYOF_SYNTHETIC(or_with)) {
1304 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1305 ored_flags = ANYOF_FLAGS(or_with);
1308 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1309 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1312 ANYOF_FLAGS(ssc) |= ored_flags;
1314 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1315 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1316 * 'or_with' may be inverted. When not inverted, we have the simple
1317 * situation of computing:
1318 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1319 * If P1|P2 yields a situation with both a class and its complement are
1320 * set, like having both \w and \W, this matches all code points, and we
1321 * can delete these from the P component of the ssc going forward. XXX We
1322 * might be able to delete all the P components, but I (khw) am not certain
1323 * about this, and it is better to be safe.
1326 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1327 * <= (C1 | P1) | ~C2
1328 * <= (C1 | ~C2) | P1
1329 * (which results in actually simpler code than the non-inverted case)
1332 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1333 && ! is_ANYOF_SYNTHETIC(or_with))
1335 /* We ignore P2, leaving P1 going forward */
1336 } /* else Not inverted */
1337 else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
1338 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1339 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1341 for (i = 0; i < ANYOF_MAX; i += 2) {
1342 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1344 ssc_match_all_cp(ssc);
1345 ANYOF_POSIXL_CLEAR(ssc, i);
1346 ANYOF_POSIXL_CLEAR(ssc, i+1);
1354 FALSE /* Already has been inverted */
1358 PERL_STATIC_INLINE void
1359 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1361 PERL_ARGS_ASSERT_SSC_UNION;
1363 assert(is_ANYOF_SYNTHETIC(ssc));
1365 _invlist_union_maybe_complement_2nd(ssc->invlist,
1371 PERL_STATIC_INLINE void
1372 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1374 const bool invert2nd)
1376 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1378 assert(is_ANYOF_SYNTHETIC(ssc));
1380 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1386 PERL_STATIC_INLINE void
1387 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1389 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1391 assert(is_ANYOF_SYNTHETIC(ssc));
1393 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1396 PERL_STATIC_INLINE void
1397 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1399 /* AND just the single code point 'cp' into the SSC 'ssc' */
1401 SV* cp_list = _new_invlist(2);
1403 PERL_ARGS_ASSERT_SSC_CP_AND;
1405 assert(is_ANYOF_SYNTHETIC(ssc));
1407 cp_list = add_cp_to_invlist(cp_list, cp);
1408 ssc_intersection(ssc, cp_list,
1409 FALSE /* Not inverted */
1411 SvREFCNT_dec_NN(cp_list);
1414 PERL_STATIC_INLINE void
1415 S_ssc_clear_locale(regnode_ssc *ssc)
1417 /* Set the SSC 'ssc' to not match any locale things */
1418 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1420 assert(is_ANYOF_SYNTHETIC(ssc));
1422 ANYOF_POSIXL_ZERO(ssc);
1423 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1427 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1429 /* The inversion list in the SSC is marked mortal; now we need a more
1430 * permanent copy, which is stored the same way that is done in a regular
1431 * ANYOF node, with the first 256 code points in a bit map */
1433 SV* invlist = invlist_clone(ssc->invlist);
1435 PERL_ARGS_ASSERT_SSC_FINALIZE;
1437 assert(is_ANYOF_SYNTHETIC(ssc));
1439 /* The code in this file assumes that all but these flags aren't relevant
1440 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1441 * time we reach here */
1442 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1444 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1446 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1447 NULL, NULL, NULL, FALSE);
1449 /* Make sure is clone-safe */
1450 ssc->invlist = NULL;
1452 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1453 ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
1456 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1459 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1460 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1461 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1462 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1463 ? (TRIE_LIST_CUR( idx ) - 1) \
1469 dump_trie(trie,widecharmap,revcharmap)
1470 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1471 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1473 These routines dump out a trie in a somewhat readable format.
1474 The _interim_ variants are used for debugging the interim
1475 tables that are used to generate the final compressed
1476 representation which is what dump_trie expects.
1478 Part of the reason for their existence is to provide a form
1479 of documentation as to how the different representations function.
1484 Dumps the final compressed table form of the trie to Perl_debug_log.
1485 Used for debugging make_trie().
1489 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1490 AV *revcharmap, U32 depth)
1493 SV *sv=sv_newmortal();
1494 int colwidth= widecharmap ? 6 : 4;
1496 GET_RE_DEBUG_FLAGS_DECL;
1498 PERL_ARGS_ASSERT_DUMP_TRIE;
1500 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1501 (int)depth * 2 + 2,"",
1502 "Match","Base","Ofs" );
1504 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1505 SV ** const tmp = av_fetch( revcharmap, state, 0);
1507 PerlIO_printf( Perl_debug_log, "%*s",
1509 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1510 PL_colors[0], PL_colors[1],
1511 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1512 PERL_PV_ESCAPE_FIRSTCHAR
1517 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1518 (int)depth * 2 + 2,"");
1520 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1521 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1522 PerlIO_printf( Perl_debug_log, "\n");
1524 for( state = 1 ; state < trie->statecount ; state++ ) {
1525 const U32 base = trie->states[ state ].trans.base;
1527 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1528 (int)depth * 2 + 2,"", (UV)state);
1530 if ( trie->states[ state ].wordnum ) {
1531 PerlIO_printf( Perl_debug_log, " W%4X",
1532 trie->states[ state ].wordnum );
1534 PerlIO_printf( Perl_debug_log, "%6s", "" );
1537 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1542 while( ( base + ofs < trie->uniquecharcount ) ||
1543 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1544 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1548 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1550 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1551 if ( ( base + ofs >= trie->uniquecharcount )
1552 && ( base + ofs - trie->uniquecharcount
1554 && trie->trans[ base + ofs
1555 - trie->uniquecharcount ].check == state )
1557 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1559 (UV)trie->trans[ base + ofs
1560 - trie->uniquecharcount ].next );
1562 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1566 PerlIO_printf( Perl_debug_log, "]");
1569 PerlIO_printf( Perl_debug_log, "\n" );
1571 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1573 for (word=1; word <= trie->wordcount; word++) {
1574 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1575 (int)word, (int)(trie->wordinfo[word].prev),
1576 (int)(trie->wordinfo[word].len));
1578 PerlIO_printf(Perl_debug_log, "\n" );
1581 Dumps a fully constructed but uncompressed trie in list form.
1582 List tries normally only are used for construction when the number of
1583 possible chars (trie->uniquecharcount) is very high.
1584 Used for debugging make_trie().
1587 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1588 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1592 SV *sv=sv_newmortal();
1593 int colwidth= widecharmap ? 6 : 4;
1594 GET_RE_DEBUG_FLAGS_DECL;
1596 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1598 /* print out the table precompression. */
1599 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1600 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1601 "------:-----+-----------------\n" );
1603 for( state=1 ; state < next_alloc ; state ++ ) {
1606 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1607 (int)depth * 2 + 2,"", (UV)state );
1608 if ( ! trie->states[ state ].wordnum ) {
1609 PerlIO_printf( Perl_debug_log, "%5s| ","");
1611 PerlIO_printf( Perl_debug_log, "W%4x| ",
1612 trie->states[ state ].wordnum
1615 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1616 SV ** const tmp = av_fetch( revcharmap,
1617 TRIE_LIST_ITEM(state,charid).forid, 0);
1619 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1621 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1623 PL_colors[0], PL_colors[1],
1624 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1625 | PERL_PV_ESCAPE_FIRSTCHAR
1627 TRIE_LIST_ITEM(state,charid).forid,
1628 (UV)TRIE_LIST_ITEM(state,charid).newstate
1631 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1632 (int)((depth * 2) + 14), "");
1635 PerlIO_printf( Perl_debug_log, "\n");
1640 Dumps a fully constructed but uncompressed trie in table form.
1641 This is the normal DFA style state transition table, with a few
1642 twists to facilitate compression later.
1643 Used for debugging make_trie().
1646 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1647 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1652 SV *sv=sv_newmortal();
1653 int colwidth= widecharmap ? 6 : 4;
1654 GET_RE_DEBUG_FLAGS_DECL;
1656 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1659 print out the table precompression so that we can do a visual check
1660 that they are identical.
1663 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1665 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1666 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1668 PerlIO_printf( Perl_debug_log, "%*s",
1670 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1671 PL_colors[0], PL_colors[1],
1672 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1673 PERL_PV_ESCAPE_FIRSTCHAR
1679 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1681 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1682 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1685 PerlIO_printf( Perl_debug_log, "\n" );
1687 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1689 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1690 (int)depth * 2 + 2,"",
1691 (UV)TRIE_NODENUM( state ) );
1693 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1694 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1696 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1698 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1700 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1701 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1702 (UV)trie->trans[ state ].check );
1704 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1705 (UV)trie->trans[ state ].check,
1706 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1714 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1715 startbranch: the first branch in the whole branch sequence
1716 first : start branch of sequence of branch-exact nodes.
1717 May be the same as startbranch
1718 last : Thing following the last branch.
1719 May be the same as tail.
1720 tail : item following the branch sequence
1721 count : words in the sequence
1722 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1723 depth : indent depth
1725 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1727 A trie is an N'ary tree where the branches are determined by digital
1728 decomposition of the key. IE, at the root node you look up the 1st character and
1729 follow that branch repeat until you find the end of the branches. Nodes can be
1730 marked as "accepting" meaning they represent a complete word. Eg:
1734 would convert into the following structure. Numbers represent states, letters
1735 following numbers represent valid transitions on the letter from that state, if
1736 the number is in square brackets it represents an accepting state, otherwise it
1737 will be in parenthesis.
1739 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1743 (1) +-i->(6)-+-s->[7]
1745 +-s->(3)-+-h->(4)-+-e->[5]
1747 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1749 This shows that when matching against the string 'hers' we will begin at state 1
1750 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1751 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1752 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1753 single traverse. We store a mapping from accepting to state to which word was
1754 matched, and then when we have multiple possibilities we try to complete the
1755 rest of the regex in the order in which they occured in the alternation.
1757 The only prior NFA like behaviour that would be changed by the TRIE support is
1758 the silent ignoring of duplicate alternations which are of the form:
1760 / (DUPE|DUPE) X? (?{ ... }) Y /x
1762 Thus EVAL blocks following a trie may be called a different number of times with
1763 and without the optimisation. With the optimisations dupes will be silently
1764 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1765 the following demonstrates:
1767 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1769 which prints out 'word' three times, but
1771 'words'=~/(word|word|word)(?{ print $1 })S/
1773 which doesnt print it out at all. This is due to other optimisations kicking in.
1775 Example of what happens on a structural level:
1777 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1779 1: CURLYM[1] {1,32767}(18)
1790 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1791 and should turn into:
1793 1: CURLYM[1] {1,32767}(18)
1795 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1803 Cases where tail != last would be like /(?foo|bar)baz/:
1813 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1814 and would end up looking like:
1817 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1824 d = uvchr_to_utf8_flags(d, uv, 0);
1826 is the recommended Unicode-aware way of saying
1831 #define TRIE_STORE_REVCHAR(val) \
1834 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1835 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1836 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1837 SvCUR_set(zlopp, kapow - flrbbbbb); \
1840 av_push(revcharmap, zlopp); \
1842 char ooooff = (char)val; \
1843 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1847 /* This gets the next character from the input, folding it if not already
1849 #define TRIE_READ_CHAR STMT_START { \
1852 /* if it is UTF then it is either already folded, or does not need \
1854 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
1856 else if (folder == PL_fold_latin1) { \
1857 /* This folder implies Unicode rules, which in the range expressible \
1858 * by not UTF is the lower case, with the two exceptions, one of \
1859 * which should have been taken care of before calling this */ \
1860 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1861 uvc = toLOWER_L1(*uc); \
1862 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1865 /* raw data, will be folded later if needed */ \
1873 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1874 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1875 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1876 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1878 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1879 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1880 TRIE_LIST_CUR( state )++; \
1883 #define TRIE_LIST_NEW(state) STMT_START { \
1884 Newxz( trie->states[ state ].trans.list, \
1885 4, reg_trie_trans_le ); \
1886 TRIE_LIST_CUR( state ) = 1; \
1887 TRIE_LIST_LEN( state ) = 4; \
1890 #define TRIE_HANDLE_WORD(state) STMT_START { \
1891 U16 dupe= trie->states[ state ].wordnum; \
1892 regnode * const noper_next = regnext( noper ); \
1895 /* store the word for dumping */ \
1897 if (OP(noper) != NOTHING) \
1898 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1900 tmp = newSVpvn_utf8( "", 0, UTF ); \
1901 av_push( trie_words, tmp ); \
1905 trie->wordinfo[curword].prev = 0; \
1906 trie->wordinfo[curword].len = wordlen; \
1907 trie->wordinfo[curword].accept = state; \
1909 if ( noper_next < tail ) { \
1911 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1913 trie->jump[curword] = (U16)(noper_next - convert); \
1915 jumper = noper_next; \
1917 nextbranch= regnext(cur); \
1921 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1922 /* chain, so that when the bits of chain are later */\
1923 /* linked together, the dups appear in the chain */\
1924 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1925 trie->wordinfo[dupe].prev = curword; \
1927 /* we haven't inserted this word yet. */ \
1928 trie->states[ state ].wordnum = curword; \
1933 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1934 ( ( base + charid >= ucharcount \
1935 && base + charid < ubound \
1936 && state == trie->trans[ base - ucharcount + charid ].check \
1937 && trie->trans[ base - ucharcount + charid ].next ) \
1938 ? trie->trans[ base - ucharcount + charid ].next \
1939 : ( state==1 ? special : 0 ) \
1943 #define MADE_JUMP_TRIE 2
1944 #define MADE_EXACT_TRIE 4
1947 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1948 regnode *first, regnode *last, regnode *tail,
1949 U32 word_count, U32 flags, U32 depth)
1952 /* first pass, loop through and scan words */
1953 reg_trie_data *trie;
1954 HV *widecharmap = NULL;
1955 AV *revcharmap = newAV();
1961 regnode *jumper = NULL;
1962 regnode *nextbranch = NULL;
1963 regnode *convert = NULL;
1964 U32 *prev_states; /* temp array mapping each state to previous one */
1965 /* we just use folder as a flag in utf8 */
1966 const U8 * folder = NULL;
1969 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1970 AV *trie_words = NULL;
1971 /* along with revcharmap, this only used during construction but both are
1972 * useful during debugging so we store them in the struct when debugging.
1975 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1976 STRLEN trie_charcount=0;
1978 SV *re_trie_maxbuff;
1979 GET_RE_DEBUG_FLAGS_DECL;
1981 PERL_ARGS_ASSERT_MAKE_TRIE;
1983 PERL_UNUSED_ARG(depth);
1990 case EXACTFU: folder = PL_fold_latin1; break;
1991 case EXACTF: folder = PL_fold; break;
1992 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1995 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
1997 trie->startstate = 1;
1998 trie->wordcount = word_count;
1999 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2000 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2002 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2003 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2004 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2007 trie_words = newAV();
2010 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2011 assert(re_trie_maxbuff);
2012 if (!SvIOK(re_trie_maxbuff)) {
2013 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2015 DEBUG_TRIE_COMPILE_r({
2016 PerlIO_printf( Perl_debug_log,
2017 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2018 (int)depth * 2 + 2, "",
2019 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2020 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2023 /* Find the node we are going to overwrite */
2024 if ( first == startbranch && OP( last ) != BRANCH ) {
2025 /* whole branch chain */
2028 /* branch sub-chain */
2029 convert = NEXTOPER( first );
2032 /* -- First loop and Setup --
2034 We first traverse the branches and scan each word to determine if it
2035 contains widechars, and how many unique chars there are, this is
2036 important as we have to build a table with at least as many columns as we
2039 We use an array of integers to represent the character codes 0..255
2040 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2041 the native representation of the character value as the key and IV's for
2044 *TODO* If we keep track of how many times each character is used we can
2045 remap the columns so that the table compression later on is more
2046 efficient in terms of memory by ensuring the most common value is in the
2047 middle and the least common are on the outside. IMO this would be better
2048 than a most to least common mapping as theres a decent chance the most
2049 common letter will share a node with the least common, meaning the node
2050 will not be compressible. With a middle is most common approach the worst
2051 case is when we have the least common nodes twice.
2055 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2056 regnode *noper = NEXTOPER( cur );
2057 const U8 *uc = (U8*)STRING( noper );
2058 const U8 *e = uc + STR_LEN( noper );
2060 U32 wordlen = 0; /* required init */
2061 STRLEN minchars = 0;
2062 STRLEN maxchars = 0;
2063 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2066 if (OP(noper) == NOTHING) {
2067 regnode *noper_next= regnext(noper);
2068 if (noper_next != tail && OP(noper_next) == flags) {
2070 uc= (U8*)STRING(noper);
2071 e= uc + STR_LEN(noper);
2072 trie->minlen= STR_LEN(noper);
2079 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2080 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2081 regardless of encoding */
2082 if (OP( noper ) == EXACTFU_SS) {
2083 /* false positives are ok, so just set this */
2084 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2087 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2089 TRIE_CHARCOUNT(trie)++;
2092 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2093 * is in effect. Under /i, this character can match itself, or
2094 * anything that folds to it. If not under /i, it can match just
2095 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2096 * all fold to k, and all are single characters. But some folds
2097 * expand to more than one character, so for example LATIN SMALL
2098 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2099 * the string beginning at 'uc' is 'ffi', it could be matched by
2100 * three characters, or just by the one ligature character. (It
2101 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2102 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2103 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2104 * match.) The trie needs to know the minimum and maximum number
2105 * of characters that could match so that it can use size alone to
2106 * quickly reject many match attempts. The max is simple: it is
2107 * the number of folded characters in this branch (since a fold is
2108 * never shorter than what folds to it. */
2112 /* And the min is equal to the max if not under /i (indicated by
2113 * 'folder' being NULL), or there are no multi-character folds. If
2114 * there is a multi-character fold, the min is incremented just
2115 * once, for the character that folds to the sequence. Each
2116 * character in the sequence needs to be added to the list below of
2117 * characters in the trie, but we count only the first towards the
2118 * min number of characters needed. This is done through the
2119 * variable 'foldlen', which is returned by the macros that look
2120 * for these sequences as the number of bytes the sequence
2121 * occupies. Each time through the loop, we decrement 'foldlen' by
2122 * how many bytes the current char occupies. Only when it reaches
2123 * 0 do we increment 'minchars' or look for another multi-character
2125 if (folder == NULL) {
2128 else if (foldlen > 0) {
2129 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2134 /* See if *uc is the beginning of a multi-character fold. If
2135 * so, we decrement the length remaining to look at, to account
2136 * for the current character this iteration. (We can use 'uc'
2137 * instead of the fold returned by TRIE_READ_CHAR because for
2138 * non-UTF, the latin1_safe macro is smart enough to account
2139 * for all the unfolded characters, and because for UTF, the
2140 * string will already have been folded earlier in the
2141 * compilation process */
2143 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2144 foldlen -= UTF8SKIP(uc);
2147 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2152 /* The current character (and any potential folds) should be added
2153 * to the possible matching characters for this position in this
2157 U8 folded= folder[ (U8) uvc ];
2158 if ( !trie->charmap[ folded ] ) {
2159 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2160 TRIE_STORE_REVCHAR( folded );
2163 if ( !trie->charmap[ uvc ] ) {
2164 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2165 TRIE_STORE_REVCHAR( uvc );
2168 /* store the codepoint in the bitmap, and its folded
2170 TRIE_BITMAP_SET(trie, uvc);
2172 /* store the folded codepoint */
2173 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2176 /* store first byte of utf8 representation of
2177 variant codepoints */
2178 if (! UVCHR_IS_INVARIANT(uvc)) {
2179 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2182 set_bit = 0; /* We've done our bit :-) */
2186 /* XXX We could come up with the list of code points that fold
2187 * to this using PL_utf8_foldclosures, except not for
2188 * multi-char folds, as there may be multiple combinations
2189 * there that could work, which needs to wait until runtime to
2190 * resolve (The comment about LIGATURE FFI above is such an
2195 widecharmap = newHV();
2197 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2200 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2202 if ( !SvTRUE( *svpp ) ) {
2203 sv_setiv( *svpp, ++trie->uniquecharcount );
2204 TRIE_STORE_REVCHAR(uvc);
2207 } /* end loop through characters in this branch of the trie */
2209 /* We take the min and max for this branch and combine to find the min
2210 * and max for all branches processed so far */
2211 if( cur == first ) {
2212 trie->minlen = minchars;
2213 trie->maxlen = maxchars;
2214 } else if (minchars < trie->minlen) {
2215 trie->minlen = minchars;
2216 } else if (maxchars > trie->maxlen) {
2217 trie->maxlen = maxchars;
2219 } /* end first pass */
2220 DEBUG_TRIE_COMPILE_r(
2221 PerlIO_printf( Perl_debug_log,
2222 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2223 (int)depth * 2 + 2,"",
2224 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2225 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2226 (int)trie->minlen, (int)trie->maxlen )
2230 We now know what we are dealing with in terms of unique chars and
2231 string sizes so we can calculate how much memory a naive
2232 representation using a flat table will take. If it's over a reasonable
2233 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2234 conservative but potentially much slower representation using an array
2237 At the end we convert both representations into the same compressed
2238 form that will be used in regexec.c for matching with. The latter
2239 is a form that cannot be used to construct with but has memory
2240 properties similar to the list form and access properties similar
2241 to the table form making it both suitable for fast searches and
2242 small enough that its feasable to store for the duration of a program.
2244 See the comment in the code where the compressed table is produced
2245 inplace from the flat tabe representation for an explanation of how
2246 the compression works.
2251 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2254 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2255 > SvIV(re_trie_maxbuff) )
2258 Second Pass -- Array Of Lists Representation
2260 Each state will be represented by a list of charid:state records
2261 (reg_trie_trans_le) the first such element holds the CUR and LEN
2262 points of the allocated array. (See defines above).
2264 We build the initial structure using the lists, and then convert
2265 it into the compressed table form which allows faster lookups
2266 (but cant be modified once converted).
2269 STRLEN transcount = 1;
2271 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2272 "%*sCompiling trie using list compiler\n",
2273 (int)depth * 2 + 2, ""));
2275 trie->states = (reg_trie_state *)
2276 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2277 sizeof(reg_trie_state) );
2281 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2283 regnode *noper = NEXTOPER( cur );
2284 U8 *uc = (U8*)STRING( noper );
2285 const U8 *e = uc + STR_LEN( noper );
2286 U32 state = 1; /* required init */
2287 U16 charid = 0; /* sanity init */
2288 U32 wordlen = 0; /* required init */
2290 if (OP(noper) == NOTHING) {
2291 regnode *noper_next= regnext(noper);
2292 if (noper_next != tail && OP(noper_next) == flags) {
2294 uc= (U8*)STRING(noper);
2295 e= uc + STR_LEN(noper);
2299 if (OP(noper) != NOTHING) {
2300 for ( ; uc < e ; uc += len ) {
2305 charid = trie->charmap[ uvc ];
2307 SV** const svpp = hv_fetch( widecharmap,
2314 charid=(U16)SvIV( *svpp );
2317 /* charid is now 0 if we dont know the char read, or
2318 * nonzero if we do */
2325 if ( !trie->states[ state ].trans.list ) {
2326 TRIE_LIST_NEW( state );
2329 check <= TRIE_LIST_USED( state );
2332 if ( TRIE_LIST_ITEM( state, check ).forid
2335 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2340 newstate = next_alloc++;
2341 prev_states[newstate] = state;
2342 TRIE_LIST_PUSH( state, charid, newstate );
2347 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2351 TRIE_HANDLE_WORD(state);
2353 } /* end second pass */
2355 /* next alloc is the NEXT state to be allocated */
2356 trie->statecount = next_alloc;
2357 trie->states = (reg_trie_state *)
2358 PerlMemShared_realloc( trie->states,
2360 * sizeof(reg_trie_state) );
2362 /* and now dump it out before we compress it */
2363 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2364 revcharmap, next_alloc,
2368 trie->trans = (reg_trie_trans *)
2369 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2376 for( state=1 ; state < next_alloc ; state ++ ) {
2380 DEBUG_TRIE_COMPILE_MORE_r(
2381 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2385 if (trie->states[state].trans.list) {
2386 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2390 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2391 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2392 if ( forid < minid ) {
2394 } else if ( forid > maxid ) {
2398 if ( transcount < tp + maxid - minid + 1) {
2400 trie->trans = (reg_trie_trans *)
2401 PerlMemShared_realloc( trie->trans,
2403 * sizeof(reg_trie_trans) );
2404 Zero( trie->trans + (transcount / 2),
2408 base = trie->uniquecharcount + tp - minid;
2409 if ( maxid == minid ) {
2411 for ( ; zp < tp ; zp++ ) {
2412 if ( ! trie->trans[ zp ].next ) {
2413 base = trie->uniquecharcount + zp - minid;
2414 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2416 trie->trans[ zp ].check = state;
2422 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2424 trie->trans[ tp ].check = state;
2429 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2430 const U32 tid = base
2431 - trie->uniquecharcount
2432 + TRIE_LIST_ITEM( state, idx ).forid;
2433 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2435 trie->trans[ tid ].check = state;
2437 tp += ( maxid - minid + 1 );
2439 Safefree(trie->states[ state ].trans.list);
2442 DEBUG_TRIE_COMPILE_MORE_r(
2443 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2446 trie->states[ state ].trans.base=base;
2448 trie->lasttrans = tp + 1;
2452 Second Pass -- Flat Table Representation.
2454 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2455 each. We know that we will need Charcount+1 trans at most to store
2456 the data (one row per char at worst case) So we preallocate both
2457 structures assuming worst case.
2459 We then construct the trie using only the .next slots of the entry
2462 We use the .check field of the first entry of the node temporarily
2463 to make compression both faster and easier by keeping track of how
2464 many non zero fields are in the node.
2466 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2469 There are two terms at use here: state as a TRIE_NODEIDX() which is
2470 a number representing the first entry of the node, and state as a
2471 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2472 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2473 if there are 2 entrys per node. eg:
2481 The table is internally in the right hand, idx form. However as we
2482 also have to deal with the states array which is indexed by nodenum
2483 we have to use TRIE_NODENUM() to convert.
2486 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2487 "%*sCompiling trie using table compiler\n",
2488 (int)depth * 2 + 2, ""));
2490 trie->trans = (reg_trie_trans *)
2491 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2492 * trie->uniquecharcount + 1,
2493 sizeof(reg_trie_trans) );
2494 trie->states = (reg_trie_state *)
2495 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2496 sizeof(reg_trie_state) );
2497 next_alloc = trie->uniquecharcount + 1;
2500 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2502 regnode *noper = NEXTOPER( cur );
2503 const U8 *uc = (U8*)STRING( noper );
2504 const U8 *e = uc + STR_LEN( noper );
2506 U32 state = 1; /* required init */
2508 U16 charid = 0; /* sanity init */
2509 U32 accept_state = 0; /* sanity init */
2511 U32 wordlen = 0; /* required init */
2513 if (OP(noper) == NOTHING) {
2514 regnode *noper_next= regnext(noper);
2515 if (noper_next != tail && OP(noper_next) == flags) {
2517 uc= (U8*)STRING(noper);
2518 e= uc + STR_LEN(noper);
2522 if ( OP(noper) != NOTHING ) {
2523 for ( ; uc < e ; uc += len ) {
2528 charid = trie->charmap[ uvc ];
2530 SV* const * const svpp = hv_fetch( widecharmap,
2534 charid = svpp ? (U16)SvIV(*svpp) : 0;
2538 if ( !trie->trans[ state + charid ].next ) {
2539 trie->trans[ state + charid ].next = next_alloc;
2540 trie->trans[ state ].check++;
2541 prev_states[TRIE_NODENUM(next_alloc)]
2542 = TRIE_NODENUM(state);
2543 next_alloc += trie->uniquecharcount;
2545 state = trie->trans[ state + charid ].next;
2547 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2549 /* charid is now 0 if we dont know the char read, or
2550 * nonzero if we do */
2553 accept_state = TRIE_NODENUM( state );
2554 TRIE_HANDLE_WORD(accept_state);
2556 } /* end second pass */
2558 /* and now dump it out before we compress it */
2559 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2561 next_alloc, depth+1));
2565 * Inplace compress the table.*
2567 For sparse data sets the table constructed by the trie algorithm will
2568 be mostly 0/FAIL transitions or to put it another way mostly empty.
2569 (Note that leaf nodes will not contain any transitions.)
2571 This algorithm compresses the tables by eliminating most such
2572 transitions, at the cost of a modest bit of extra work during lookup:
2574 - Each states[] entry contains a .base field which indicates the
2575 index in the state[] array wheres its transition data is stored.
2577 - If .base is 0 there are no valid transitions from that node.
2579 - If .base is nonzero then charid is added to it to find an entry in
2582 -If trans[states[state].base+charid].check!=state then the
2583 transition is taken to be a 0/Fail transition. Thus if there are fail
2584 transitions at the front of the node then the .base offset will point
2585 somewhere inside the previous nodes data (or maybe even into a node
2586 even earlier), but the .check field determines if the transition is
2590 The following process inplace converts the table to the compressed
2591 table: We first do not compress the root node 1,and mark all its
2592 .check pointers as 1 and set its .base pointer as 1 as well. This
2593 allows us to do a DFA construction from the compressed table later,
2594 and ensures that any .base pointers we calculate later are greater
2597 - We set 'pos' to indicate the first entry of the second node.
2599 - We then iterate over the columns of the node, finding the first and
2600 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2601 and set the .check pointers accordingly, and advance pos
2602 appropriately and repreat for the next node. Note that when we copy
2603 the next pointers we have to convert them from the original
2604 NODEIDX form to NODENUM form as the former is not valid post
2607 - If a node has no transitions used we mark its base as 0 and do not
2608 advance the pos pointer.
2610 - If a node only has one transition we use a second pointer into the
2611 structure to fill in allocated fail transitions from other states.
2612 This pointer is independent of the main pointer and scans forward
2613 looking for null transitions that are allocated to a state. When it
2614 finds one it writes the single transition into the "hole". If the
2615 pointer doesnt find one the single transition is appended as normal.
2617 - Once compressed we can Renew/realloc the structures to release the
2620 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2621 specifically Fig 3.47 and the associated pseudocode.
2625 const U32 laststate = TRIE_NODENUM( next_alloc );
2628 trie->statecount = laststate;
2630 for ( state = 1 ; state < laststate ; state++ ) {
2632 const U32 stateidx = TRIE_NODEIDX( state );
2633 const U32 o_used = trie->trans[ stateidx ].check;
2634 U32 used = trie->trans[ stateidx ].check;
2635 trie->trans[ stateidx ].check = 0;
2638 used && charid < trie->uniquecharcount;
2641 if ( flag || trie->trans[ stateidx + charid ].next ) {
2642 if ( trie->trans[ stateidx + charid ].next ) {
2644 for ( ; zp < pos ; zp++ ) {
2645 if ( ! trie->trans[ zp ].next ) {
2649 trie->states[ state ].trans.base
2651 + trie->uniquecharcount
2653 trie->trans[ zp ].next
2654 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2656 trie->trans[ zp ].check = state;
2657 if ( ++zp > pos ) pos = zp;
2664 trie->states[ state ].trans.base
2665 = pos + trie->uniquecharcount - charid ;
2667 trie->trans[ pos ].next
2668 = SAFE_TRIE_NODENUM(
2669 trie->trans[ stateidx + charid ].next );
2670 trie->trans[ pos ].check = state;
2675 trie->lasttrans = pos + 1;
2676 trie->states = (reg_trie_state *)
2677 PerlMemShared_realloc( trie->states, laststate
2678 * sizeof(reg_trie_state) );
2679 DEBUG_TRIE_COMPILE_MORE_r(
2680 PerlIO_printf( Perl_debug_log,
2681 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2682 (int)depth * 2 + 2,"",
2683 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2687 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2690 } /* end table compress */
2692 DEBUG_TRIE_COMPILE_MORE_r(
2693 PerlIO_printf(Perl_debug_log,
2694 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2695 (int)depth * 2 + 2, "",
2696 (UV)trie->statecount,
2697 (UV)trie->lasttrans)
2699 /* resize the trans array to remove unused space */
2700 trie->trans = (reg_trie_trans *)
2701 PerlMemShared_realloc( trie->trans, trie->lasttrans
2702 * sizeof(reg_trie_trans) );
2704 { /* Modify the program and insert the new TRIE node */
2705 U8 nodetype =(U8)(flags & 0xFF);
2709 regnode *optimize = NULL;
2710 #ifdef RE_TRACK_PATTERN_OFFSETS
2713 U32 mjd_nodelen = 0;
2714 #endif /* RE_TRACK_PATTERN_OFFSETS */
2715 #endif /* DEBUGGING */
2717 This means we convert either the first branch or the first Exact,
2718 depending on whether the thing following (in 'last') is a branch
2719 or not and whther first is the startbranch (ie is it a sub part of
2720 the alternation or is it the whole thing.)
2721 Assuming its a sub part we convert the EXACT otherwise we convert
2722 the whole branch sequence, including the first.
2724 /* Find the node we are going to overwrite */
2725 if ( first != startbranch || OP( last ) == BRANCH ) {
2726 /* branch sub-chain */
2727 NEXT_OFF( first ) = (U16)(last - first);
2728 #ifdef RE_TRACK_PATTERN_OFFSETS
2730 mjd_offset= Node_Offset((convert));
2731 mjd_nodelen= Node_Length((convert));
2734 /* whole branch chain */
2736 #ifdef RE_TRACK_PATTERN_OFFSETS
2739 const regnode *nop = NEXTOPER( convert );
2740 mjd_offset= Node_Offset((nop));
2741 mjd_nodelen= Node_Length((nop));
2745 PerlIO_printf(Perl_debug_log,
2746 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2747 (int)depth * 2 + 2, "",
2748 (UV)mjd_offset, (UV)mjd_nodelen)
2751 /* But first we check to see if there is a common prefix we can
2752 split out as an EXACT and put in front of the TRIE node. */
2753 trie->startstate= 1;
2754 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2756 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2760 const U32 base = trie->states[ state ].trans.base;
2762 if ( trie->states[state].wordnum )
2765 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2766 if ( ( base + ofs >= trie->uniquecharcount ) &&
2767 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2768 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2770 if ( ++count > 1 ) {
2771 SV **tmp = av_fetch( revcharmap, ofs, 0);
2772 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2773 if ( state == 1 ) break;
2775 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2777 PerlIO_printf(Perl_debug_log,
2778 "%*sNew Start State=%"UVuf" Class: [",
2779 (int)depth * 2 + 2, "",
2782 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2783 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2785 TRIE_BITMAP_SET(trie,*ch);
2787 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2789 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2793 TRIE_BITMAP_SET(trie,*ch);
2795 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2796 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2802 SV **tmp = av_fetch( revcharmap, idx, 0);
2804 char *ch = SvPV( *tmp, len );
2806 SV *sv=sv_newmortal();
2807 PerlIO_printf( Perl_debug_log,
2808 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2809 (int)depth * 2 + 2, "",
2811 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2812 PL_colors[0], PL_colors[1],
2813 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2814 PERL_PV_ESCAPE_FIRSTCHAR
2819 OP( convert ) = nodetype;
2820 str=STRING(convert);
2823 STR_LEN(convert) += len;
2829 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2834 trie->prefixlen = (state-1);
2836 regnode *n = convert+NODE_SZ_STR(convert);
2837 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2838 trie->startstate = state;
2839 trie->minlen -= (state - 1);
2840 trie->maxlen -= (state - 1);
2842 /* At least the UNICOS C compiler choked on this
2843 * being argument to DEBUG_r(), so let's just have
2846 #ifdef PERL_EXT_RE_BUILD
2852 regnode *fix = convert;
2853 U32 word = trie->wordcount;
2855 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2856 while( ++fix < n ) {
2857 Set_Node_Offset_Length(fix, 0, 0);
2860 SV ** const tmp = av_fetch( trie_words, word, 0 );
2862 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2863 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2865 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2873 NEXT_OFF(convert) = (U16)(tail - convert);
2874 DEBUG_r(optimize= n);
2880 if ( trie->maxlen ) {
2881 NEXT_OFF( convert ) = (U16)(tail - convert);
2882 ARG_SET( convert, data_slot );
2883 /* Store the offset to the first unabsorbed branch in
2884 jump[0], which is otherwise unused by the jump logic.
2885 We use this when dumping a trie and during optimisation. */
2887 trie->jump[0] = (U16)(nextbranch - convert);
2889 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2890 * and there is a bitmap
2891 * and the first "jump target" node we found leaves enough room
2892 * then convert the TRIE node into a TRIEC node, with the bitmap
2893 * embedded inline in the opcode - this is hypothetically faster.
2895 if ( !trie->states[trie->startstate].wordnum
2897 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2899 OP( convert ) = TRIEC;
2900 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2901 PerlMemShared_free(trie->bitmap);
2904 OP( convert ) = TRIE;
2906 /* store the type in the flags */
2907 convert->flags = nodetype;
2911 + regarglen[ OP( convert ) ];
2913 /* XXX We really should free up the resource in trie now,
2914 as we won't use them - (which resources?) dmq */
2916 /* needed for dumping*/
2917 DEBUG_r(if (optimize) {
2918 regnode *opt = convert;
2920 while ( ++opt < optimize) {
2921 Set_Node_Offset_Length(opt,0,0);
2924 Try to clean up some of the debris left after the
2927 while( optimize < jumper ) {
2928 mjd_nodelen += Node_Length((optimize));
2929 OP( optimize ) = OPTIMIZED;
2930 Set_Node_Offset_Length(optimize,0,0);
2933 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2935 } /* end node insert */
2936 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2938 /* Finish populating the prev field of the wordinfo array. Walk back
2939 * from each accept state until we find another accept state, and if
2940 * so, point the first word's .prev field at the second word. If the
2941 * second already has a .prev field set, stop now. This will be the
2942 * case either if we've already processed that word's accept state,
2943 * or that state had multiple words, and the overspill words were
2944 * already linked up earlier.
2951 for (word=1; word <= trie->wordcount; word++) {
2953 if (trie->wordinfo[word].prev)
2955 state = trie->wordinfo[word].accept;
2957 state = prev_states[state];
2960 prev = trie->states[state].wordnum;
2964 trie->wordinfo[word].prev = prev;
2966 Safefree(prev_states);
2970 /* and now dump out the compressed format */
2971 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2973 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2975 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2976 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2978 SvREFCNT_dec_NN(revcharmap);
2982 : trie->startstate>1
2988 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
2990 /* The Trie is constructed and compressed now so we can build a fail array if
2993 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2995 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2999 We find the fail state for each state in the trie, this state is the longest
3000 proper suffix of the current state's 'word' that is also a proper prefix of
3001 another word in our trie. State 1 represents the word '' and is thus the
3002 default fail state. This allows the DFA not to have to restart after its
3003 tried and failed a word at a given point, it simply continues as though it
3004 had been matching the other word in the first place.
3006 'abcdgu'=~/abcdefg|cdgu/
3007 When we get to 'd' we are still matching the first word, we would encounter
3008 'g' which would fail, which would bring us to the state representing 'd' in
3009 the second word where we would try 'g' and succeed, proceeding to match
3012 /* add a fail transition */
3013 const U32 trie_offset = ARG(source);
3014 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3016 const U32 ucharcount = trie->uniquecharcount;
3017 const U32 numstates = trie->statecount;
3018 const U32 ubound = trie->lasttrans + ucharcount;
3022 U32 base = trie->states[ 1 ].trans.base;
3025 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3027 GET_RE_DEBUG_FLAGS_DECL;
3029 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3030 PERL_UNUSED_CONTEXT;
3032 PERL_UNUSED_ARG(depth);
3035 if ( OP(source) == TRIE ) {
3036 struct regnode_1 *op = (struct regnode_1 *)
3037 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3038 StructCopy(source,op,struct regnode_1);
3039 stclass = (regnode *)op;
3041 struct regnode_charclass *op = (struct regnode_charclass *)
3042 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3043 StructCopy(source,op,struct regnode_charclass);
3044 stclass = (regnode *)op;
3046 OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3048 ARG_SET( stclass, data_slot );
3049 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3050 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3051 aho->trie=trie_offset;
3052 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3053 Copy( trie->states, aho->states, numstates, reg_trie_state );
3054 Newxz( q, numstates, U32);
3055 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3058 /* initialize fail[0..1] to be 1 so that we always have
3059 a valid final fail state */
3060 fail[ 0 ] = fail[ 1 ] = 1;
3062 for ( charid = 0; charid < ucharcount ; charid++ ) {
3063 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3065 q[ q_write ] = newstate;
3066 /* set to point at the root */
3067 fail[ q[ q_write++ ] ]=1;
3070 while ( q_read < q_write) {
3071 const U32 cur = q[ q_read++ % numstates ];
3072 base = trie->states[ cur ].trans.base;
3074 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3075 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3077 U32 fail_state = cur;
3080 fail_state = fail[ fail_state ];
3081 fail_base = aho->states[ fail_state ].trans.base;
3082 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3084 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3085 fail[ ch_state ] = fail_state;
3086 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3088 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3090 q[ q_write++ % numstates] = ch_state;
3094 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3095 when we fail in state 1, this allows us to use the
3096 charclass scan to find a valid start char. This is based on the principle
3097 that theres a good chance the string being searched contains lots of stuff
3098 that cant be a start char.
3100 fail[ 0 ] = fail[ 1 ] = 0;
3101 DEBUG_TRIE_COMPILE_r({
3102 PerlIO_printf(Perl_debug_log,
3103 "%*sStclass Failtable (%"UVuf" states): 0",
3104 (int)(depth * 2), "", (UV)numstates
3106 for( q_read=1; q_read<numstates; q_read++ ) {
3107 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3109 PerlIO_printf(Perl_debug_log, "\n");
3112 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3117 #define DEBUG_PEEP(str,scan,depth) \
3118 DEBUG_OPTIMISE_r({if (scan){ \
3119 SV * const mysv=sv_newmortal(); \
3120 regnode *Next = regnext(scan); \
3121 regprop(RExC_rx, mysv, scan, NULL); \
3122 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3123 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3124 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3128 /* The below joins as many adjacent EXACTish nodes as possible into a single
3129 * one. The regop may be changed if the node(s) contain certain sequences that
3130 * require special handling. The joining is only done if:
3131 * 1) there is room in the current conglomerated node to entirely contain the
3133 * 2) they are the exact same node type
3135 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3136 * these get optimized out
3138 * If a node is to match under /i (folded), the number of characters it matches
3139 * can be different than its character length if it contains a multi-character
3140 * fold. *min_subtract is set to the total delta number of characters of the
3143 * And *unfolded_multi_char is set to indicate whether or not the node contains
3144 * an unfolded multi-char fold. This happens when whether the fold is valid or
3145 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3146 * SMALL LETTER SHARP S, as only if the target string being matched against
3147 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3148 * folding rules depend on the locale in force at runtime. (Multi-char folds
3149 * whose components are all above the Latin1 range are not run-time locale
3150 * dependent, and have already been folded by the time this function is
3153 * This is as good a place as any to discuss the design of handling these
3154 * multi-character fold sequences. It's been wrong in Perl for a very long
3155 * time. There are three code points in Unicode whose multi-character folds
3156 * were long ago discovered to mess things up. The previous designs for
3157 * dealing with these involved assigning a special node for them. This
3158 * approach doesn't always work, as evidenced by this example:
3159 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3160 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3161 * would match just the \xDF, it won't be able to handle the case where a
3162 * successful match would have to cross the node's boundary. The new approach
3163 * that hopefully generally solves the problem generates an EXACTFU_SS node
3164 * that is "sss" in this case.
3166 * It turns out that there are problems with all multi-character folds, and not
3167 * just these three. Now the code is general, for all such cases. The
3168 * approach taken is:
3169 * 1) This routine examines each EXACTFish node that could contain multi-
3170 * character folded sequences. Since a single character can fold into
3171 * such a sequence, the minimum match length for this node is less than
3172 * the number of characters in the node. This routine returns in
3173 * *min_subtract how many characters to subtract from the the actual
3174 * length of the string to get a real minimum match length; it is 0 if
3175 * there are no multi-char foldeds. This delta is used by the caller to
3176 * adjust the min length of the match, and the delta between min and max,
3177 * so that the optimizer doesn't reject these possibilities based on size
3179 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3180 * is used for an EXACTFU node that contains at least one "ss" sequence in
3181 * it. For non-UTF-8 patterns and strings, this is the only case where
3182 * there is a possible fold length change. That means that a regular
3183 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3184 * with length changes, and so can be processed faster. regexec.c takes
3185 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3186 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3187 * known until runtime). This saves effort in regex matching. However,
3188 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3189 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3190 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3191 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3192 * possibilities for the non-UTF8 patterns are quite simple, except for
3193 * the sharp s. All the ones that don't involve a UTF-8 target string are
3194 * members of a fold-pair, and arrays are set up for all of them so that
3195 * the other member of the pair can be found quickly. Code elsewhere in
3196 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3197 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3198 * described in the next item.
3199 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3200 * validity of the fold won't be known until runtime, and so must remain
3201 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3202 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3203 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3204 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3205 * The reason this is a problem is that the optimizer part of regexec.c
3206 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3207 * that a character in the pattern corresponds to at most a single
3208 * character in the target string. (And I do mean character, and not byte
3209 * here, unlike other parts of the documentation that have never been
3210 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3211 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3212 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3213 * nodes, violate the assumption, and they are the only instances where it
3214 * is violated. I'm reluctant to try to change the assumption, as the
3215 * code involved is impenetrable to me (khw), so instead the code here
3216 * punts. This routine examines EXACTFL nodes, and (when the pattern
3217 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3218 * boolean indicating whether or not the node contains such a fold. When
3219 * it is true, the caller sets a flag that later causes the optimizer in
3220 * this file to not set values for the floating and fixed string lengths,
3221 * and thus avoids the optimizer code in regexec.c that makes the invalid
3222 * assumption. Thus, there is no optimization based on string lengths for
3223 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3224 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3225 * assumption is wrong only in these cases is that all other non-UTF-8
3226 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3227 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3228 * EXACTF nodes because we don't know at compile time if it actually
3229 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3230 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3231 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3232 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3233 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3234 * string would require the pattern to be forced into UTF-8, the overhead
3235 * of which we want to avoid. Similarly the unfolded multi-char folds in
3236 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3239 * Similarly, the code that generates tries doesn't currently handle
3240 * not-already-folded multi-char folds, and it looks like a pain to change
3241 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3242 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3243 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3244 * using /iaa matching will be doing so almost entirely with ASCII
3245 * strings, so this should rarely be encountered in practice */
3247 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3248 if (PL_regkind[OP(scan)] == EXACT) \
3249 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3252 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3253 UV *min_subtract, bool *unfolded_multi_char,
3254 U32 flags,regnode *val, U32 depth)
3256 /* Merge several consecutive EXACTish nodes into one. */
3257 regnode *n = regnext(scan);
3259 regnode *next = scan + NODE_SZ_STR(scan);
3263 regnode *stop = scan;
3264 GET_RE_DEBUG_FLAGS_DECL;
3266 PERL_UNUSED_ARG(depth);
3269 PERL_ARGS_ASSERT_JOIN_EXACT;
3270 #ifndef EXPERIMENTAL_INPLACESCAN
3271 PERL_UNUSED_ARG(flags);
3272 PERL_UNUSED_ARG(val);
3274 DEBUG_PEEP("join",scan,depth);
3276 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3277 * EXACT ones that are mergeable to the current one. */
3279 && (PL_regkind[OP(n)] == NOTHING
3280 || (stringok && OP(n) == OP(scan)))
3282 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3285 if (OP(n) == TAIL || n > next)
3287 if (PL_regkind[OP(n)] == NOTHING) {
3288 DEBUG_PEEP("skip:",n,depth);
3289 NEXT_OFF(scan) += NEXT_OFF(n);
3290 next = n + NODE_STEP_REGNODE;
3297 else if (stringok) {
3298 const unsigned int oldl = STR_LEN(scan);
3299 regnode * const nnext = regnext(n);
3301 /* XXX I (khw) kind of doubt that this works on platforms (should
3302 * Perl ever run on one) where U8_MAX is above 255 because of lots
3303 * of other assumptions */
3304 /* Don't join if the sum can't fit into a single node */
3305 if (oldl + STR_LEN(n) > U8_MAX)
3308 DEBUG_PEEP("merg",n,depth);
3311 NEXT_OFF(scan) += NEXT_OFF(n);
3312 STR_LEN(scan) += STR_LEN(n);
3313 next = n + NODE_SZ_STR(n);
3314 /* Now we can overwrite *n : */
3315 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3323 #ifdef EXPERIMENTAL_INPLACESCAN
3324 if (flags && !NEXT_OFF(n)) {
3325 DEBUG_PEEP("atch", val, depth);
3326 if (reg_off_by_arg[OP(n)]) {
3327 ARG_SET(n, val - n);
3330 NEXT_OFF(n) = val - n;
3338 *unfolded_multi_char = FALSE;
3340 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3341 * can now analyze for sequences of problematic code points. (Prior to
3342 * this final joining, sequences could have been split over boundaries, and
3343 * hence missed). The sequences only happen in folding, hence for any
3344 * non-EXACT EXACTish node */
3345 if (OP(scan) != EXACT) {
3346 U8* s0 = (U8*) STRING(scan);
3348 U8* s_end = s0 + STR_LEN(scan);
3350 int total_count_delta = 0; /* Total delta number of characters that
3351 multi-char folds expand to */
3353 /* One pass is made over the node's string looking for all the
3354 * possibilities. To avoid some tests in the loop, there are two main
3355 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3360 if (OP(scan) == EXACTFL) {
3363 /* An EXACTFL node would already have been changed to another
3364 * node type unless there is at least one character in it that
3365 * is problematic; likely a character whose fold definition
3366 * won't be known until runtime, and so has yet to be folded.
3367 * For all but the UTF-8 locale, folds are 1-1 in length, but
3368 * to handle the UTF-8 case, we need to create a temporary
3369 * folded copy using UTF-8 locale rules in order to analyze it.
3370 * This is because our macros that look to see if a sequence is
3371 * a multi-char fold assume everything is folded (otherwise the
3372 * tests in those macros would be too complicated and slow).
3373 * Note that here, the non-problematic folds will have already
3374 * been done, so we can just copy such characters. We actually
3375 * don't completely fold the EXACTFL string. We skip the
3376 * unfolded multi-char folds, as that would just create work
3377 * below to figure out the size they already are */
3379 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3382 STRLEN s_len = UTF8SKIP(s);
3383 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3384 Copy(s, d, s_len, U8);
3387 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3388 *unfolded_multi_char = TRUE;
3389 Copy(s, d, s_len, U8);
3392 else if (isASCII(*s)) {
3393 *(d++) = toFOLD(*s);
3397 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3403 /* Point the remainder of the routine to look at our temporary
3407 } /* End of creating folded copy of EXACTFL string */
3409 /* Examine the string for a multi-character fold sequence. UTF-8
3410 * patterns have all characters pre-folded by the time this code is
3412 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3413 length sequence we are looking for is 2 */
3415 int count = 0; /* How many characters in a multi-char fold */
3416 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3417 if (! len) { /* Not a multi-char fold: get next char */
3422 /* Nodes with 'ss' require special handling, except for
3423 * EXACTFA-ish for which there is no multi-char fold to this */
3424 if (len == 2 && *s == 's' && *(s+1) == 's'
3425 && OP(scan) != EXACTFA
3426 && OP(scan) != EXACTFA_NO_TRIE)
3429 if (OP(scan) != EXACTFL) {
3430 OP(scan) = EXACTFU_SS;
3434 else { /* Here is a generic multi-char fold. */
3435 U8* multi_end = s + len;
3437 /* Count how many characters are in it. In the case of
3438 * /aa, no folds which contain ASCII code points are
3439 * allowed, so check for those, and skip if found. */
3440 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3441 count = utf8_length(s, multi_end);
3445 while (s < multi_end) {
3448 goto next_iteration;
3458 /* The delta is how long the sequence is minus 1 (1 is how long
3459 * the character that folds to the sequence is) */
3460 total_count_delta += count - 1;
3464 /* We created a temporary folded copy of the string in EXACTFL
3465 * nodes. Therefore we need to be sure it doesn't go below zero,
3466 * as the real string could be shorter */
3467 if (OP(scan) == EXACTFL) {
3468 int total_chars = utf8_length((U8*) STRING(scan),
3469 (U8*) STRING(scan) + STR_LEN(scan));
3470 if (total_count_delta > total_chars) {
3471 total_count_delta = total_chars;
3475 *min_subtract += total_count_delta;
3478 else if (OP(scan) == EXACTFA) {
3480 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3481 * fold to the ASCII range (and there are no existing ones in the
3482 * upper latin1 range). But, as outlined in the comments preceding
3483 * this function, we need to flag any occurrences of the sharp s.
3484 * This character forbids trie formation (because of added
3487 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3488 OP(scan) = EXACTFA_NO_TRIE;
3489 *unfolded_multi_char = TRUE;
3498 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3499 * folds that are all Latin1. As explained in the comments
3500 * preceding this function, we look also for the sharp s in EXACTF
3501 * and EXACTFL nodes; it can be in the final position. Otherwise
3502 * we can stop looking 1 byte earlier because have to find at least
3503 * two characters for a multi-fold */
3504 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3509 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3510 if (! len) { /* Not a multi-char fold. */
3511 if (*s == LATIN_SMALL_LETTER_SHARP_S
3512 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3514 *unfolded_multi_char = TRUE;
3521 && isARG2_lower_or_UPPER_ARG1('s', *s)
3522 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
3525 /* EXACTF nodes need to know that the minimum length
3526 * changed so that a sharp s in the string can match this
3527 * ss in the pattern, but they remain EXACTF nodes, as they
3528 * won't match this unless the target string is is UTF-8,
3529 * which we don't know until runtime. EXACTFL nodes can't
3530 * transform into EXACTFU nodes */
3531 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3532 OP(scan) = EXACTFU_SS;
3536 *min_subtract += len - 1;
3543 /* Allow dumping but overwriting the collection of skipped
3544 * ops and/or strings with fake optimized ops */
3545 n = scan + NODE_SZ_STR(scan);
3553 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3557 /* REx optimizer. Converts nodes into quicker variants "in place".
3558 Finds fixed substrings. */
3560 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3561 to the position after last scanned or to NULL. */
3563 #define INIT_AND_WITHP \
3564 assert(!and_withp); \
3565 Newx(and_withp,1, regnode_ssc); \
3566 SAVEFREEPV(and_withp)
3568 /* this is a chain of data about sub patterns we are processing that
3569 need to be handled separately/specially in study_chunk. Its so
3570 we can simulate recursion without losing state. */
3572 typedef struct scan_frame {
3573 regnode *last; /* last node to process in this frame */
3574 regnode *next; /* next node to process when last is reached */
3575 struct scan_frame *prev; /*previous frame*/
3576 U32 prev_recursed_depth;
3577 I32 stop; /* what stopparen do we use */
3582 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3583 SSize_t *minlenp, SSize_t *deltap,
3588 regnode_ssc *and_withp,
3589 U32 flags, U32 depth)
3590 /* scanp: Start here (read-write). */
3591 /* deltap: Write maxlen-minlen here. */
3592 /* last: Stop before this one. */
3593 /* data: string data about the pattern */
3594 /* stopparen: treat close N as END */
3595 /* recursed: which subroutines have we recursed into */
3596 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3599 /* There must be at least this number of characters to match */
3602 regnode *scan = *scanp, *next;
3604 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3605 int is_inf_internal = 0; /* The studied chunk is infinite */
3606 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3607 scan_data_t data_fake;
3608 SV *re_trie_maxbuff = NULL;
3609 regnode *first_non_open = scan;
3610 SSize_t stopmin = SSize_t_MAX;
3611 scan_frame *frame = NULL;
3612 GET_RE_DEBUG_FLAGS_DECL;
3614 PERL_ARGS_ASSERT_STUDY_CHUNK;
3617 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3620 while (first_non_open && OP(first_non_open) == OPEN)
3621 first_non_open=regnext(first_non_open);
3626 while ( scan && OP(scan) != END && scan < last ){
3627 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3628 node length to get a real minimum (because
3629 the folded version may be shorter) */
3630 bool unfolded_multi_char = FALSE;
3631 /* Peephole optimizer: */
3632 DEBUG_OPTIMISE_MORE_r(
3634 PerlIO_printf(Perl_debug_log,
3635 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3636 ((int) depth*2), "", (long)stopparen,
3637 (unsigned long)depth, (unsigned long)recursed_depth);
3638 if (recursed_depth) {
3641 for ( j = 0 ; j < recursed_depth ; j++ ) {
3642 PerlIO_printf(Perl_debug_log,"[");
3643 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3644 PerlIO_printf(Perl_debug_log,"%d",
3645 PAREN_TEST(RExC_study_chunk_recursed +
3646 (j * RExC_study_chunk_recursed_bytes), i)
3649 PerlIO_printf(Perl_debug_log,"]");
3652 PerlIO_printf(Perl_debug_log,"\n");
3655 DEBUG_STUDYDATA("Peep:", data, depth);
3656 DEBUG_PEEP("Peep", scan, depth);
3659 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3660 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3661 * by a different invocation of reg() -- Yves
3663 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3665 /* Follow the next-chain of the current node and optimize
3666 away all the NOTHINGs from it. */
3667 if (OP(scan) != CURLYX) {
3668 const int max = (reg_off_by_arg[OP(scan)]
3670 /* I32 may be smaller than U16 on CRAYs! */
3671 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3672 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3676 /* Skip NOTHING and LONGJMP. */
3677 while ((n = regnext(n))
3678 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3679 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3680 && off + noff < max)
3682 if (reg_off_by_arg[OP(scan)])
3685 NEXT_OFF(scan) = off;
3690 /* The principal pseudo-switch. Cannot be a switch, since we
3691 look into several different things. */
3692 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3693 || OP(scan) == IFTHEN) {
3694 next = regnext(scan);
3696 /* demq: the op(next)==code check is to see if we have
3697 * "branch-branch" AFAICT */
3699 if (OP(next) == code || code == IFTHEN) {
3700 /* NOTE - There is similar code to this block below for
3701 * handling TRIE nodes on a re-study. If you change stuff here
3702 * check there too. */
3703 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3705 regnode * const startbranch=scan;
3707 if (flags & SCF_DO_SUBSTR) {
3708 /* Cannot merge strings after this. */
3709 scan_commit(pRExC_state, data, minlenp, is_inf);
3712 if (flags & SCF_DO_STCLASS)
3713 ssc_init_zero(pRExC_state, &accum);
3715 while (OP(scan) == code) {
3716 SSize_t deltanext, minnext, fake;
3718 regnode_ssc this_class;
3721 data_fake.flags = 0;
3723 data_fake.whilem_c = data->whilem_c;
3724 data_fake.last_closep = data->last_closep;
3727 data_fake.last_closep = &fake;
3729 data_fake.pos_delta = delta;
3730 next = regnext(scan);
3731 scan = NEXTOPER(scan);
3733 scan = NEXTOPER(scan);
3734 if (flags & SCF_DO_STCLASS) {
3735 ssc_init(pRExC_state, &this_class);
3736 data_fake.start_class = &this_class;
3737 f = SCF_DO_STCLASS_AND;
3739 if (flags & SCF_WHILEM_VISITED_POS)
3740 f |= SCF_WHILEM_VISITED_POS;
3742 /* we suppose the run is continuous, last=next...*/
3743 minnext = study_chunk(pRExC_state, &scan, minlenp,
3744 &deltanext, next, &data_fake, stopparen,
3745 recursed_depth, NULL, f,depth+1);
3748 if (deltanext == SSize_t_MAX) {
3749 is_inf = is_inf_internal = 1;
3751 } else if (max1 < minnext + deltanext)
3752 max1 = minnext + deltanext;
3754 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3756 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3757 if ( stopmin > minnext)
3758 stopmin = min + min1;
3759 flags &= ~SCF_DO_SUBSTR;
3761 data->flags |= SCF_SEEN_ACCEPT;
3764 if (data_fake.flags & SF_HAS_EVAL)
3765 data->flags |= SF_HAS_EVAL;
3766 data->whilem_c = data_fake.whilem_c;
3768 if (flags & SCF_DO_STCLASS)
3769 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3771 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3773 if (flags & SCF_DO_SUBSTR) {
3774 data->pos_min += min1;
3775 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3776 data->pos_delta = SSize_t_MAX;
3778 data->pos_delta += max1 - min1;
3779 if (max1 != min1 || is_inf)
3780 data->longest = &(data->longest_float);
3783 if (delta == SSize_t_MAX
3784 || SSize_t_MAX - delta - (max1 - min1) < 0)
3785 delta = SSize_t_MAX;
3787 delta += max1 - min1;
3788 if (flags & SCF_DO_STCLASS_OR) {
3789 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3791 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3792 flags &= ~SCF_DO_STCLASS;
3795 else if (flags & SCF_DO_STCLASS_AND) {
3797 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3798 flags &= ~SCF_DO_STCLASS;
3801 /* Switch to OR mode: cache the old value of
3802 * data->start_class */
3804 StructCopy(data->start_class, and_withp, regnode_ssc);
3805 flags &= ~SCF_DO_STCLASS_AND;
3806 StructCopy(&accum, data->start_class, regnode_ssc);
3807 flags |= SCF_DO_STCLASS_OR;
3811 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3812 OP( startbranch ) == BRANCH )
3816 Assuming this was/is a branch we are dealing with: 'scan'
3817 now points at the item that follows the branch sequence,
3818 whatever it is. We now start at the beginning of the
3819 sequence and look for subsequences of
3825 which would be constructed from a pattern like
3828 If we can find such a subsequence we need to turn the first
3829 element into a trie and then add the subsequent branch exact
3830 strings to the trie.
3834 1. patterns where the whole set of branches can be
3837 2. patterns where only a subset can be converted.
3839 In case 1 we can replace the whole set with a single regop
3840 for the trie. In case 2 we need to keep the start and end
3843 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3844 becomes BRANCH TRIE; BRANCH X;
3846 There is an additional case, that being where there is a
3847 common prefix, which gets split out into an EXACT like node
3848 preceding the TRIE node.
3850 If x(1..n)==tail then we can do a simple trie, if not we make
3851 a "jump" trie, such that when we match the appropriate word
3852 we "jump" to the appropriate tail node. Essentially we turn
3853 a nested if into a case structure of sorts.
3858 if (!re_trie_maxbuff) {
3859 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3860 if (!SvIOK(re_trie_maxbuff))
3861 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3863 if ( SvIV(re_trie_maxbuff)>=0 ) {
3865 regnode *first = (regnode *)NULL;
3866 regnode *last = (regnode *)NULL;
3867 regnode *tail = scan;
3872 SV * const mysv = sv_newmortal(); /* for dumping */
3874 /* var tail is used because there may be a TAIL
3875 regop in the way. Ie, the exacts will point to the
3876 thing following the TAIL, but the last branch will
3877 point at the TAIL. So we advance tail. If we
3878 have nested (?:) we may have to move through several
3882 while ( OP( tail ) == TAIL ) {
3883 /* this is the TAIL generated by (?:) */
3884 tail = regnext( tail );
3888 DEBUG_TRIE_COMPILE_r({
3889 regprop(RExC_rx, mysv, tail, NULL);
3890 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3891 (int)depth * 2 + 2, "",
3892 "Looking for TRIE'able sequences. Tail node is: ",
3893 SvPV_nolen_const( mysv )
3899 Step through the branches
3900 cur represents each branch,
3901 noper is the first thing to be matched as part
3903 noper_next is the regnext() of that node.
3905 We normally handle a case like this
3906 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3907 support building with NOJUMPTRIE, which restricts
3908 the trie logic to structures like /FOO|BAR/.
3910 If noper is a trieable nodetype then the branch is
3911 a possible optimization target. If we are building
3912 under NOJUMPTRIE then we require that noper_next is
3913 the same as scan (our current position in the regex
3916 Once we have two or more consecutive such branches
3917 we can create a trie of the EXACT's contents and
3918 stitch it in place into the program.
3920 If the sequence represents all of the branches in
3921 the alternation we replace the entire thing with a
3924 Otherwise when it is a subsequence we need to
3925 stitch it in place and replace only the relevant
3926 branches. This means the first branch has to remain
3927 as it is used by the alternation logic, and its
3928 next pointer, and needs to be repointed at the item
3929 on the branch chain following the last branch we
3930 have optimized away.
3932 This could be either a BRANCH, in which case the
3933 subsequence is internal, or it could be the item
3934 following the branch sequence in which case the
3935 subsequence is at the end (which does not
3936 necessarily mean the first node is the start of the
3939 TRIE_TYPE(X) is a define which maps the optype to a
3943 ----------------+-----------
3947 EXACTFU_SS | EXACTFU
3952 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3953 ( EXACT == (X) ) ? EXACT : \
3954 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3955 ( EXACTFA == (X) ) ? EXACTFA : \
3958 /* dont use tail as the end marker for this traverse */
3959 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3960 regnode * const noper = NEXTOPER( cur );
3961 U8 noper_type = OP( noper );
3962 U8 noper_trietype = TRIE_TYPE( noper_type );
3963 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3964 regnode * const noper_next = regnext( noper );
3965 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3966 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3969 DEBUG_TRIE_COMPILE_r({
3970 regprop(RExC_rx, mysv, cur, NULL);
3971 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3972 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3974 regprop(RExC_rx, mysv, noper, NULL);
3975 PerlIO_printf( Perl_debug_log, " -> %s",
3976 SvPV_nolen_const(mysv));
3979 regprop(RExC_rx, mysv, noper_next, NULL);
3980 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3981 SvPV_nolen_const(mysv));
3983 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3984 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3985 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3989 /* Is noper a trieable nodetype that can be merged
3990 * with the current trie (if there is one)? */
3994 ( noper_trietype == NOTHING)
3995 || ( trietype == NOTHING )
3996 || ( trietype == noper_trietype )
3999 && noper_next == tail
4003 /* Handle mergable triable node Either we are
4004 * the first node in a new trieable sequence,
4005 * in which case we do some bookkeeping,
4006 * otherwise we update the end pointer. */
4009 if ( noper_trietype == NOTHING ) {
4010 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4011 regnode * const noper_next = regnext( noper );
4012 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4013 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4016 if ( noper_next_trietype ) {
4017 trietype = noper_next_trietype;
4018 } else if (noper_next_type) {
4019 /* a NOTHING regop is 1 regop wide.
4020 * We need at least two for a trie
4021 * so we can't merge this in */
4025 trietype = noper_trietype;
4028 if ( trietype == NOTHING )
4029 trietype = noper_trietype;
4034 } /* end handle mergable triable node */
4036 /* handle unmergable node -
4037 * noper may either be a triable node which can
4038 * not be tried together with the current trie,
4039 * or a non triable node */
4041 /* If last is set and trietype is not
4042 * NOTHING then we have found at least two
4043 * triable branch sequences in a row of a
4044 * similar trietype so we can turn them
4045 * into a trie. If/when we allow NOTHING to
4046 * start a trie sequence this condition
4047 * will be required, and it isn't expensive
4048 * so we leave it in for now. */
4049 if ( trietype && trietype != NOTHING )
4050 make_trie( pRExC_state,
4051 startbranch, first, cur, tail,
4052 count, trietype, depth+1 );
4053 last = NULL; /* note: we clear/update
4054 first, trietype etc below,
4055 so we dont do it here */
4059 && noper_next == tail
4062 /* noper is triable, so we can start a new
4066 trietype = noper_trietype;
4068 /* if we already saw a first but the
4069 * current node is not triable then we have
4070 * to reset the first information. */
4075 } /* end handle unmergable node */
4076 } /* loop over branches */
4077 DEBUG_TRIE_COMPILE_r({
4078 regprop(RExC_rx, mysv, cur, NULL);
4079 PerlIO_printf( Perl_debug_log,
4080 "%*s- %s (%d) <SCAN FINISHED>\n",
4082 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4085 if ( last && trietype ) {
4086 if ( trietype != NOTHING ) {
4087 /* the last branch of the sequence was part of
4088 * a trie, so we have to construct it here
4089 * outside of the loop */
4090 made= make_trie( pRExC_state, startbranch,
4091 first, scan, tail, count,
4092 trietype, depth+1 );
4093 #ifdef TRIE_STUDY_OPT
4094 if ( ((made == MADE_EXACT_TRIE &&
4095 startbranch == first)
4096 || ( first_non_open == first )) &&
4098 flags |= SCF_TRIE_RESTUDY;
4099 if ( startbranch == first
4102 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4107 /* at this point we know whatever we have is a
4108 * NOTHING sequence/branch AND if 'startbranch'
4109 * is 'first' then we can turn the whole thing
4112 if ( startbranch == first ) {
4114 /* the entire thing is a NOTHING sequence,
4115 * something like this: (?:|) So we can
4116 * turn it into a plain NOTHING op. */
4117 DEBUG_TRIE_COMPILE_r({
4118 regprop(RExC_rx, mysv, cur, NULL);
4119 PerlIO_printf( Perl_debug_log,
4120 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4121 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4124 OP(startbranch)= NOTHING;
4125 NEXT_OFF(startbranch)= tail - startbranch;
4126 for ( opt= startbranch + 1; opt < tail ; opt++ )
4130 } /* end if ( last) */
4131 } /* TRIE_MAXBUF is non zero */
4136 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4137 scan = NEXTOPER(NEXTOPER(scan));
4138 } else /* single branch is optimized. */
4139 scan = NEXTOPER(scan);
4141 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4142 scan_frame *newframe = NULL;
4146 U32 my_recursed_depth= recursed_depth;
4148 if (OP(scan) != SUSPEND) {
4149 /* set the pointer */
4150 if (OP(scan) == GOSUB) {
4152 RExC_recurse[ARG2L(scan)] = scan;
4153 start = RExC_open_parens[paren-1];
4154 end = RExC_close_parens[paren-1];
4157 start = RExC_rxi->program + 1;
4162 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4164 if (!recursed_depth) {
4165 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4167 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4168 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4169 RExC_study_chunk_recursed_bytes, U8);
4171 /* we havent recursed into this paren yet, so recurse into it */
4172 DEBUG_STUDYDATA("set:", data,depth);
4173 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4174 my_recursed_depth= recursed_depth + 1;
4175 Newx(newframe,1,scan_frame);
4177 DEBUG_STUDYDATA("inf:", data,depth);
4178 /* some form of infinite recursion, assume infinite length
4180 if (flags & SCF_DO_SUBSTR) {
4181 scan_commit(pRExC_state, data, minlenp, is_inf);
4182 data->longest = &(data->longest_float);
4184 is_inf = is_inf_internal = 1;
4185 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4186 ssc_anything(data->start_class);
4187 flags &= ~SCF_DO_STCLASS;
4190 Newx(newframe,1,scan_frame);
4193 end = regnext(scan);
4198 SAVEFREEPV(newframe);
4199 newframe->next = regnext(scan);
4200 newframe->last = last;
4201 newframe->stop = stopparen;
4202 newframe->prev = frame;
4203 newframe->prev_recursed_depth = recursed_depth;
4205 DEBUG_STUDYDATA("frame-new:",data,depth);
4206 DEBUG_PEEP("fnew", scan, depth);
4213 recursed_depth= my_recursed_depth;
4218 else if (OP(scan) == EXACT) {
4219 SSize_t l = STR_LEN(scan);
4222 const U8 * const s = (U8*)STRING(scan);
4223 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4224 l = utf8_length(s, s + l);
4226 uc = *((U8*)STRING(scan));
4229 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4230 /* The code below prefers earlier match for fixed
4231 offset, later match for variable offset. */
4232 if (data->last_end == -1) { /* Update the start info. */
4233 data->last_start_min = data->pos_min;
4234 data->last_start_max = is_inf
4235 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4237 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4239 SvUTF8_on(data->last_found);
4241 SV * const sv = data->last_found;
4242 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4243 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4244 if (mg && mg->mg_len >= 0)
4245 mg->mg_len += utf8_length((U8*)STRING(scan),
4246 (U8*)STRING(scan)+STR_LEN(scan));
4248 data->last_end = data->pos_min + l;
4249 data->pos_min += l; /* As in the first entry. */
4250 data->flags &= ~SF_BEFORE_EOL;
4253 /* ANDing the code point leaves at most it, and not in locale, and
4254 * can't match null string */
4255 if (flags & SCF_DO_STCLASS_AND) {
4256 ssc_cp_and(data->start_class, uc);
4257 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4258 ssc_clear_locale(data->start_class);
4260 else if (flags & SCF_DO_STCLASS_OR) {
4261 ssc_add_cp(data->start_class, uc);
4262 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4264 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4265 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4267 flags &= ~SCF_DO_STCLASS;
4269 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
4271 SSize_t l = STR_LEN(scan);
4272 UV uc = *((U8*)STRING(scan));
4273 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4274 separate code points */
4275 const U8 * s = (U8*)STRING(scan);
4277 /* Search for fixed substrings supports EXACT only. */
4278 if (flags & SCF_DO_SUBSTR) {
4280 scan_commit(pRExC_state, data, minlenp, is_inf);
4283 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4284 l = utf8_length(s, s + l);
4286 if (unfolded_multi_char) {
4287 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4289 min += l - min_subtract;
4291 delta += min_subtract;
4292 if (flags & SCF_DO_SUBSTR) {
4293 data->pos_min += l - min_subtract;
4294 if (data->pos_min < 0) {
4297 data->pos_delta += min_subtract;
4299 data->longest = &(data->longest_float);
4303 if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4304 ssc_clear_locale(data->start_class);
4309 /* We punt and assume can match anything if the node begins
4310 * with a multi-character fold. Things are complicated. For
4311 * example, /ffi/i could match any of:
4312 * "\N{LATIN SMALL LIGATURE FFI}"
4313 * "\N{LATIN SMALL LIGATURE FF}I"
4314 * "F\N{LATIN SMALL LIGATURE FI}"
4315 * plus several other things; and making sure we have all the
4316 * possibilities is hard. */
4317 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4319 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4323 /* Any Latin1 range character can potentially match any
4324 * other depending on the locale */
4325 if (OP(scan) == EXACTFL) {
4326 _invlist_union(EXACTF_invlist, PL_Latin1,
4330 /* But otherwise, it matches at least itself. We can
4331 * quickly tell if it has a distinct fold, and if so,
4332 * it matches that as well */
4333 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4334 if (IS_IN_SOME_FOLD_L1(uc)) {
4335 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4336 PL_fold_latin1[uc]);
4340 /* Some characters match above-Latin1 ones under /i. This
4341 * is true of EXACTFL ones when the locale is UTF-8 */
4342 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4343 && (! isASCII(uc) || (OP(scan) != EXACTFA
4344 && OP(scan) != EXACTFA_NO_TRIE)))
4346 add_above_Latin1_folds(pRExC_state,
4352 else { /* Pattern is UTF-8 */
4353 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4354 STRLEN foldlen = UTF8SKIP(s);
4355 const U8* e = s + STR_LEN(scan);
4358 /* The only code points that aren't folded in a UTF EXACTFish
4359 * node are are the problematic ones in EXACTFL nodes */
4360 if (OP(scan) == EXACTFL
4361 && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4363 /* We need to check for the possibility that this EXACTFL
4364 * node begins with a multi-char fold. Therefore we fold
4365 * the first few characters of it so that we can make that
4370 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4372 *(d++) = (U8) toFOLD(*s);
4377 to_utf8_fold(s, d, &len);
4383 /* And set up so the code below that looks in this folded
4384 * buffer instead of the node's string */
4386 foldlen = UTF8SKIP(folded);
4390 /* When we reach here 's' points to the fold of the first
4391 * character(s) of the node; and 'e' points to far enough along
4392 * the folded string to be just past any possible multi-char
4393 * fold. 'foldlen' is the length in bytes of the first
4396 * Unlike the non-UTF-8 case, the macro for determining if a
4397 * string is a multi-char fold requires all the characters to
4398 * already be folded. This is because of all the complications
4399 * if not. Note that they are folded anyway, except in EXACTFL
4400 * nodes. Like the non-UTF case above, we punt if the node
4401 * begins with a multi-char fold */
4403 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4405 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4407 else { /* Single char fold */
4409 /* It matches all the things that fold to it, which are
4410 * found in PL_utf8_foldclosures (including itself) */
4411 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4412 if (! PL_utf8_foldclosures) {
4413 _load_PL_utf8_foldclosures();
4415 if ((listp = hv_fetch(PL_utf8_foldclosures,
4416 (char *) s, foldlen, FALSE)))
4418 AV* list = (AV*) *listp;
4420 for (k = 0; k <= av_tindex(list); k++) {
4421 SV** c_p = av_fetch(list, k, FALSE);
4427 /* /aa doesn't allow folds between ASCII and non- */
4428 if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4429 && isASCII(c) != isASCII(uc))
4434 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4439 if (flags & SCF_DO_STCLASS_AND) {
4440 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4441 ANYOF_POSIXL_ZERO(data->start_class);
4442 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4444 else if (flags & SCF_DO_STCLASS_OR) {
4445 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4446 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4448 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4449 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4451 flags &= ~SCF_DO_STCLASS;
4452 SvREFCNT_dec(EXACTF_invlist);
4454 else if (REGNODE_VARIES(OP(scan))) {
4455 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4456 I32 fl = 0, f = flags;
4457 regnode * const oscan = scan;
4458 regnode_ssc this_class;
4459 regnode_ssc *oclass = NULL;
4460 I32 next_is_eval = 0;
4462 switch (PL_regkind[OP(scan)]) {
4463 case WHILEM: /* End of (?:...)* . */
4464 scan = NEXTOPER(scan);
4467 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4468 next = NEXTOPER(scan);
4469 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4471 maxcount = REG_INFTY;
4472 next = regnext(scan);
4473 scan = NEXTOPER(scan);
4477 if (flags & SCF_DO_SUBSTR)
4482 if (flags & SCF_DO_STCLASS) {
4484 maxcount = REG_INFTY;
4485 next = regnext(scan);
4486 scan = NEXTOPER(scan);
4489 if (flags & SCF_DO_SUBSTR) {
4490 scan_commit(pRExC_state, data, minlenp, is_inf);
4491 /* Cannot extend fixed substrings */
4492 data->longest = &(data->longest_float);
4494 is_inf = is_inf_internal = 1;
4495 scan = regnext(scan);
4496 goto optimize_curly_tail;
4498 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4499 && (scan->flags == stopparen))
4504 mincount = ARG1(scan);
4505 maxcount = ARG2(scan);
4507 next = regnext(scan);
4508 if (OP(scan) == CURLYX) {
4509 I32 lp = (data ? *(data->last_closep) : 0);
4510 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4512 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4513 next_is_eval = (OP(scan) == EVAL);
4515 if (flags & SCF_DO_SUBSTR) {
4517 scan_commit(pRExC_state, data, minlenp, is_inf);
4518 /* Cannot extend fixed substrings */
4519 pos_before = data->pos_min;
4523 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4525 data->flags |= SF_IS_INF;
4527 if (flags & SCF_DO_STCLASS) {
4528 ssc_init(pRExC_state, &this_class);
4529 oclass = data->start_class;
4530 data->start_class = &this_class;
4531 f |= SCF_DO_STCLASS_AND;
4532 f &= ~SCF_DO_STCLASS_OR;
4534 /* Exclude from super-linear cache processing any {n,m}
4535 regops for which the combination of input pos and regex
4536 pos is not enough information to determine if a match
4539 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4540 regex pos at the \s*, the prospects for a match depend not
4541 only on the input position but also on how many (bar\s*)
4542 repeats into the {4,8} we are. */
4543 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4544 f &= ~SCF_WHILEM_VISITED_POS;
4546 /* This will finish on WHILEM, setting scan, or on NULL: */
4547 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4548 last, data, stopparen, recursed_depth, NULL,
4550 ? (f & ~SCF_DO_SUBSTR)
4554 if (flags & SCF_DO_STCLASS)
4555 data->start_class = oclass;
4556 if (mincount == 0 || minnext == 0) {
4557 if (flags & SCF_DO_STCLASS_OR) {
4558 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4560 else if (flags & SCF_DO_STCLASS_AND) {
4561 /* Switch to OR mode: cache the old value of
4562 * data->start_class */
4564 StructCopy(data->start_class, and_withp, regnode_ssc);
4565 flags &= ~SCF_DO_STCLASS_AND;
4566 StructCopy(&this_class, data->start_class, regnode_ssc);
4567 flags |= SCF_DO_STCLASS_OR;
4568 ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
4570 } else { /* Non-zero len */
4571 if (flags & SCF_DO_STCLASS_OR) {
4572 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4573 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4575 else if (flags & SCF_DO_STCLASS_AND)
4576 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4577 flags &= ~SCF_DO_STCLASS;
4579 if (!scan) /* It was not CURLYX, but CURLY. */
4581 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4582 /* ? quantifier ok, except for (?{ ... }) */
4583 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4584 && (minnext == 0) && (deltanext == 0)
4585 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4586 && maxcount <= REG_INFTY/3) /* Complement check for big
4589 /* Fatal warnings may leak the regexp without this: */
4590 SAVEFREESV(RExC_rx_sv);
4591 ckWARNreg(RExC_parse,
4592 "Quantifier unexpected on zero-length expression");
4593 (void)ReREFCNT_inc(RExC_rx_sv);
4596 min += minnext * mincount;
4597 is_inf_internal |= deltanext == SSize_t_MAX
4598 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4599 is_inf |= is_inf_internal;
4601 delta = SSize_t_MAX;
4603 delta += (minnext + deltanext) * maxcount
4604 - minnext * mincount;
4606 /* Try powerful optimization CURLYX => CURLYN. */
4607 if ( OP(oscan) == CURLYX && data
4608 && data->flags & SF_IN_PAR
4609 && !(data->flags & SF_HAS_EVAL)
4610 && !deltanext && minnext == 1 ) {
4611 /* Try to optimize to CURLYN. */
4612 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4613 regnode * const nxt1 = nxt;
4620 if (!REGNODE_SIMPLE(OP(nxt))
4621 && !(PL_regkind[OP(nxt)] == EXACT
4622 && STR_LEN(nxt) == 1))
4628 if (OP(nxt) != CLOSE)
4630 if (RExC_open_parens) {
4631 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4632 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4634 /* Now we know that nxt2 is the only contents: */
4635 oscan->flags = (U8)ARG(nxt);
4637 OP(nxt1) = NOTHING; /* was OPEN. */
4640 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4641 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4642 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4643 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4644 OP(nxt + 1) = OPTIMIZED; /* was count. */
4645 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4650 /* Try optimization CURLYX => CURLYM. */
4651 if ( OP(oscan) == CURLYX && data
4652 && !(data->flags & SF_HAS_PAR)
4653 && !(data->flags & SF_HAS_EVAL)
4654 && !deltanext /* atom is fixed width */
4655 && minnext != 0 /* CURLYM can't handle zero width */
4657 /* Nor characters whose fold at run-time may be
4658 * multi-character */
4659 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4661 /* XXXX How to optimize if data == 0? */
4662 /* Optimize to a simpler form. */
4663 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4667 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4668 && (OP(nxt2) != WHILEM))
4670 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4671 /* Need to optimize away parenths. */
4672 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4673 /* Set the parenth number. */
4674 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4676 oscan->flags = (U8)ARG(nxt);
4677 if (RExC_open_parens) {
4678 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4679 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4681 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4682 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4685 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4686 OP(nxt + 1) = OPTIMIZED; /* was count. */
4687 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4688 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4691 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4692 regnode *nnxt = regnext(nxt1);
4694 if (reg_off_by_arg[OP(nxt1)])
4695 ARG_SET(nxt1, nxt2 - nxt1);
4696 else if (nxt2 - nxt1 < U16_MAX)
4697 NEXT_OFF(nxt1) = nxt2 - nxt1;
4699 OP(nxt) = NOTHING; /* Cannot beautify */
4704 /* Optimize again: */
4705 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4706 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4711 else if ((OP(oscan) == CURLYX)
4712 && (flags & SCF_WHILEM_VISITED_POS)
4713 /* See the comment on a similar expression above.
4714 However, this time it's not a subexpression
4715 we care about, but the expression itself. */
4716 && (maxcount == REG_INFTY)
4717 && data && ++data->whilem_c < 16) {
4718 /* This stays as CURLYX, we can put the count/of pair. */
4719 /* Find WHILEM (as in regexec.c) */
4720 regnode *nxt = oscan + NEXT_OFF(oscan);
4722 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4724 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4725 | (RExC_whilem_seen << 4)); /* On WHILEM */
4727 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4729 if (flags & SCF_DO_SUBSTR) {
4730 SV *last_str = NULL;
4731 STRLEN last_chrs = 0;
4732 int counted = mincount != 0;
4734 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4736 SSize_t b = pos_before >= data->last_start_min
4737 ? pos_before : data->last_start_min;
4739 const char * const s = SvPV_const(data->last_found, l);
4740 SSize_t old = b - data->last_start_min;
4743 old = utf8_hop((U8*)s, old) - (U8*)s;
4745 /* Get the added string: */
4746 last_str = newSVpvn_utf8(s + old, l, UTF);
4747 last_chrs = UTF ? utf8_length((U8*)(s + old),
4748 (U8*)(s + old + l)) : l;
4749 if (deltanext == 0 && pos_before == b) {
4750 /* What was added is a constant string */
4753 SvGROW(last_str, (mincount * l) + 1);
4754 repeatcpy(SvPVX(last_str) + l,
4755 SvPVX_const(last_str), l,
4757 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4758 /* Add additional parts. */
4759 SvCUR_set(data->last_found,
4760 SvCUR(data->last_found) - l);
4761 sv_catsv(data->last_found, last_str);
4763 SV * sv = data->last_found;
4765 SvUTF8(sv) && SvMAGICAL(sv) ?
4766 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4767 if (mg && mg->mg_len >= 0)
4768 mg->mg_len += last_chrs * (mincount-1);
4770 last_chrs *= mincount;
4771 data->last_end += l * (mincount - 1);
4774 /* start offset must point into the last copy */
4775 data->last_start_min += minnext * (mincount - 1);
4776 data->last_start_max += is_inf ? SSize_t_MAX
4777 : (maxcount - 1) * (minnext + data->pos_delta);
4780 /* It is counted once already... */
4781 data->pos_min += minnext * (mincount - counted);
4783 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4784 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4785 " maxcount=%"UVuf" mincount=%"UVuf"\n",
4786 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4788 if (deltanext != SSize_t_MAX)
4789 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4790 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4791 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4793 if (deltanext == SSize_t_MAX
4794 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4795 data->pos_delta = SSize_t_MAX;
4797 data->pos_delta += - counted * deltanext +
4798 (minnext + deltanext) * maxcount - minnext * mincount;
4799 if (mincount != maxcount) {
4800 /* Cannot extend fixed substrings found inside
4802 scan_commit(pRExC_state, data, minlenp, is_inf);
4803 if (mincount && last_str) {
4804 SV * const sv = data->last_found;
4805 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4806 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4810 sv_setsv(sv, last_str);
4811 data->last_end = data->pos_min;
4812 data->last_start_min = data->pos_min - last_chrs;
4813 data->last_start_max = is_inf
4815 : data->pos_min + data->pos_delta - last_chrs;
4817 data->longest = &(data->longest_float);
4819 SvREFCNT_dec(last_str);
4821 if (data && (fl & SF_HAS_EVAL))
4822 data->flags |= SF_HAS_EVAL;
4823 optimize_curly_tail:
4824 if (OP(oscan) != CURLYX) {
4825 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4827 NEXT_OFF(oscan) += NEXT_OFF(next);
4833 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4838 if (flags & SCF_DO_SUBSTR) {
4839 /* Cannot expect anything... */
4840 scan_commit(pRExC_state, data, minlenp, is_inf);
4841 data->longest = &(data->longest_float);
4843 is_inf = is_inf_internal = 1;
4844 if (flags & SCF_DO_STCLASS_OR) {
4845 if (OP(scan) == CLUMP) {
4846 /* Actually is any start char, but very few code points
4847 * aren't start characters */
4848 ssc_match_all_cp(data->start_class);
4851 ssc_anything(data->start_class);
4854 flags &= ~SCF_DO_STCLASS;
4858 else if (OP(scan) == LNBREAK) {
4859 if (flags & SCF_DO_STCLASS) {
4860 if (flags & SCF_DO_STCLASS_AND) {
4861 ssc_intersection(data->start_class,
4862 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4863 ssc_clear_locale(data->start_class);
4864 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4866 else if (flags & SCF_DO_STCLASS_OR) {
4867 ssc_union(data->start_class,
4868 PL_XPosix_ptrs[_CC_VERTSPACE],
4870 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4872 /* See commit msg for
4873 * 749e076fceedeb708a624933726e7989f2302f6a */
4874 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4876 flags &= ~SCF_DO_STCLASS;
4879 delta++; /* Because of the 2 char string cr-lf */
4880 if (flags & SCF_DO_SUBSTR) {
4881 /* Cannot expect anything... */
4882 scan_commit(pRExC_state, data, minlenp, is_inf);
4884 data->pos_delta += 1;
4885 data->longest = &(data->longest_float);
4888 else if (REGNODE_SIMPLE(OP(scan))) {
4890 if (flags & SCF_DO_SUBSTR) {
4891 scan_commit(pRExC_state, data, minlenp, is_inf);
4895 if (flags & SCF_DO_STCLASS) {
4897 SV* my_invlist = sv_2mortal(_new_invlist(0));
4900 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4901 ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
4903 /* Some of the logic below assumes that switching
4904 locale on will only add false positives. */
4909 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4914 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4915 ssc_match_all_cp(data->start_class);
4920 SV* REG_ANY_invlist = _new_invlist(2);
4921 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4923 if (flags & SCF_DO_STCLASS_OR) {
4924 ssc_union(data->start_class,
4926 TRUE /* TRUE => invert, hence all but \n
4930 else if (flags & SCF_DO_STCLASS_AND) {
4931 ssc_intersection(data->start_class,
4933 TRUE /* TRUE => invert */
4935 ssc_clear_locale(data->start_class);
4937 SvREFCNT_dec_NN(REG_ANY_invlist);
4942 if (flags & SCF_DO_STCLASS_AND)
4943 ssc_and(pRExC_state, data->start_class,
4944 (regnode_charclass *) scan);
4946 ssc_or(pRExC_state, data->start_class,
4947 (regnode_charclass *) scan);
4955 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4956 if (flags & SCF_DO_STCLASS_AND) {
4957 bool was_there = cBOOL(
4958 ANYOF_POSIXL_TEST(data->start_class,
4960 ANYOF_POSIXL_ZERO(data->start_class);
4961 if (was_there) { /* Do an AND */
4962 ANYOF_POSIXL_SET(data->start_class, namedclass);
4964 /* No individual code points can now match */
4965 data->start_class->invlist
4966 = sv_2mortal(_new_invlist(0));
4969 int complement = namedclass + ((invert) ? -1 : 1);
4971 assert(flags & SCF_DO_STCLASS_OR);
4973 /* If the complement of this class was already there,
4974 * the result is that they match all code points,
4975 * (\d + \D == everything). Remove the classes from
4976 * future consideration. Locale is not relevant in
4978 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4979 ssc_match_all_cp(data->start_class);
4980 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4981 ANYOF_POSIXL_CLEAR(data->start_class, complement);
4983 else { /* The usual case; just add this class to the
4985 ANYOF_POSIXL_SET(data->start_class, namedclass);
4990 case NPOSIXA: /* For these, we always know the exact set of
4995 if (FLAGS(scan) == _CC_ASCII) {
4996 my_invlist = PL_XPosix_ptrs[_CC_ASCII];
4999 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5000 PL_XPosix_ptrs[_CC_ASCII],
5011 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5013 /* NPOSIXD matches all upper Latin1 code points unless the
5014 * target string being matched is UTF-8, which is
5015 * unknowable until match time. Since we are going to
5016 * invert, we want to get rid of all of them so that the
5017 * inversion will match all */
5018 if (OP(scan) == NPOSIXD) {
5019 _invlist_subtract(my_invlist, PL_UpperLatin1,
5025 if (flags & SCF_DO_STCLASS_AND) {
5026 ssc_intersection(data->start_class, my_invlist, invert);
5027 ssc_clear_locale(data->start_class);
5030 assert(flags & SCF_DO_STCLASS_OR);
5031 ssc_union(data->start_class, my_invlist, invert);
5034 if (flags & SCF_DO_STCLASS_OR)
5035 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5036 flags &= ~SCF_DO_STCLASS;
5039 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5040 data->flags |= (OP(scan) == MEOL
5043 scan_commit(pRExC_state, data, minlenp, is_inf);
5046 else if ( PL_regkind[OP(scan)] == BRANCHJ
5047 /* Lookbehind, or need to calculate parens/evals/stclass: */
5048 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5049 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
5050 if ( OP(scan) == UNLESSM &&
5052 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5053 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5056 regnode *upto= regnext(scan);
5058 SV * const mysv_val=sv_newmortal();
5059 DEBUG_STUDYDATA("OPFAIL",data,depth);
5061 /*DEBUG_PARSE_MSG("opfail");*/
5062 regprop(RExC_rx, mysv_val, upto, NULL);
5063 PerlIO_printf(Perl_debug_log,
5064 "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5065 SvPV_nolen_const(mysv_val),
5066 (IV)REG_NODE_NUM(upto),
5071 NEXT_OFF(scan) = upto - scan;
5072 for (opt= scan + 1; opt < upto ; opt++)
5073 OP(opt) = OPTIMIZED;
5077 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5078 || OP(scan) == UNLESSM )
5080 /* Negative Lookahead/lookbehind
5081 In this case we can't do fixed string optimisation.
5084 SSize_t deltanext, minnext, fake = 0;
5089 data_fake.flags = 0;
5091 data_fake.whilem_c = data->whilem_c;
5092 data_fake.last_closep = data->last_closep;
5095 data_fake.last_closep = &fake;
5096 data_fake.pos_delta = delta;
5097 if ( flags & SCF_DO_STCLASS && !scan->flags
5098 && OP(scan) == IFMATCH ) { /* Lookahead */
5099 ssc_init(pRExC_state, &intrnl);
5100 data_fake.start_class = &intrnl;
5101 f |= SCF_DO_STCLASS_AND;
5103 if (flags & SCF_WHILEM_VISITED_POS)
5104 f |= SCF_WHILEM_VISITED_POS;
5105 next = regnext(scan);
5106 nscan = NEXTOPER(NEXTOPER(scan));
5107 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5108 last, &data_fake, stopparen,
5109 recursed_depth, NULL, f, depth+1);
5112 FAIL("Variable length lookbehind not implemented");
5114 else if (minnext > (I32)U8_MAX) {
5115 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5118 scan->flags = (U8)minnext;
5121 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5123 if (data_fake.flags & SF_HAS_EVAL)
5124 data->flags |= SF_HAS_EVAL;
5125 data->whilem_c = data_fake.whilem_c;
5127 if (f & SCF_DO_STCLASS_AND) {
5128 if (flags & SCF_DO_STCLASS_OR) {
5129 /* OR before, AND after: ideally we would recurse with
5130 * data_fake to get the AND applied by study of the
5131 * remainder of the pattern, and then derecurse;
5132 * *** HACK *** for now just treat as "no information".
5133 * See [perl #56690].
5135 ssc_init(pRExC_state, data->start_class);
5137 /* AND before and after: combine and continue */
5138 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5142 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5144 /* Positive Lookahead/lookbehind
5145 In this case we can do fixed string optimisation,
5146 but we must be careful about it. Note in the case of
5147 lookbehind the positions will be offset by the minimum
5148 length of the pattern, something we won't know about
5149 until after the recurse.
5151 SSize_t deltanext, fake = 0;
5155 /* We use SAVEFREEPV so that when the full compile
5156 is finished perl will clean up the allocated
5157 minlens when it's all done. This way we don't
5158 have to worry about freeing them when we know
5159 they wont be used, which would be a pain.
5162 Newx( minnextp, 1, SSize_t );
5163 SAVEFREEPV(minnextp);
5166 StructCopy(data, &data_fake, scan_data_t);
5167 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5170 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5171 data_fake.last_found=newSVsv(data->last_found);
5175 data_fake.last_closep = &fake;
5176 data_fake.flags = 0;
5177 data_fake.pos_delta = delta;
5179 data_fake.flags |= SF_IS_INF;
5180 if ( flags & SCF_DO_STCLASS && !scan->flags
5181 && OP(scan) == IFMATCH ) { /* Lookahead */
5182 ssc_init(pRExC_state, &intrnl);
5183 data_fake.start_class = &intrnl;
5184 f |= SCF_DO_STCLASS_AND;
5186 if (flags & SCF_WHILEM_VISITED_POS)
5187 f |= SCF_WHILEM_VISITED_POS;
5188 next = regnext(scan);
5189 nscan = NEXTOPER(NEXTOPER(scan));
5191 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5192 &deltanext, last, &data_fake,
5193 stopparen, recursed_depth, NULL,
5197 FAIL("Variable length lookbehind not implemented");
5199 else if (*minnextp > (I32)U8_MAX) {
5200 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5203 scan->flags = (U8)*minnextp;
5208 if (f & SCF_DO_STCLASS_AND) {
5209 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5212 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5214 if (data_fake.flags & SF_HAS_EVAL)
5215 data->flags |= SF_HAS_EVAL;
5216 data->whilem_c = data_fake.whilem_c;
5217 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5218 if (RExC_rx->minlen<*minnextp)
5219 RExC_rx->minlen=*minnextp;
5220 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5221 SvREFCNT_dec_NN(data_fake.last_found);
5223 if ( data_fake.minlen_fixed != minlenp )
5225 data->offset_fixed= data_fake.offset_fixed;
5226 data->minlen_fixed= data_fake.minlen_fixed;
5227 data->lookbehind_fixed+= scan->flags;
5229 if ( data_fake.minlen_float != minlenp )
5231 data->minlen_float= data_fake.minlen_float;
5232 data->offset_float_min=data_fake.offset_float_min;
5233 data->offset_float_max=data_fake.offset_float_max;
5234 data->lookbehind_float+= scan->flags;
5241 else if (OP(scan) == OPEN) {
5242 if (stopparen != (I32)ARG(scan))
5245 else if (OP(scan) == CLOSE) {
5246 if (stopparen == (I32)ARG(scan)) {
5249 if ((I32)ARG(scan) == is_par) {
5250 next = regnext(scan);
5252 if ( next && (OP(next) != WHILEM) && next < last)
5253 is_par = 0; /* Disable optimization */
5256 *(data->last_closep) = ARG(scan);
5258 else if (OP(scan) == EVAL) {
5260 data->flags |= SF_HAS_EVAL;
5262 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5263 if (flags & SCF_DO_SUBSTR) {
5264 scan_commit(pRExC_state, data, minlenp, is_inf);
5265 flags &= ~SCF_DO_SUBSTR;
5267 if (data && OP(scan)==ACCEPT) {
5268 data->flags |= SCF_SEEN_ACCEPT;
5273 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5275 if (flags & SCF_DO_SUBSTR) {
5276 scan_commit(pRExC_state, data, minlenp, is_inf);
5277 data->longest = &(data->longest_float);
5279 is_inf = is_inf_internal = 1;
5280 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5281 ssc_anything(data->start_class);
5282 flags &= ~SCF_DO_STCLASS;
5284 else if (OP(scan) == GPOS) {
5285 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5286 !(delta || is_inf || (data && data->pos_delta)))
5288 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5289 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5290 if (RExC_rx->gofs < (STRLEN)min)
5291 RExC_rx->gofs = min;
5293 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5297 #ifdef TRIE_STUDY_OPT
5298 #ifdef FULL_TRIE_STUDY
5299 else if (PL_regkind[OP(scan)] == TRIE) {
5300 /* NOTE - There is similar code to this block above for handling
5301 BRANCH nodes on the initial study. If you change stuff here
5303 regnode *trie_node= scan;
5304 regnode *tail= regnext(scan);
5305 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5306 SSize_t max1 = 0, min1 = SSize_t_MAX;
5309 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5310 /* Cannot merge strings after this. */
5311 scan_commit(pRExC_state, data, minlenp, is_inf);
5313 if (flags & SCF_DO_STCLASS)
5314 ssc_init_zero(pRExC_state, &accum);
5320 const regnode *nextbranch= NULL;
5323 for ( word=1 ; word <= trie->wordcount ; word++)
5325 SSize_t deltanext=0, minnext=0, f = 0, fake;
5326 regnode_ssc this_class;
5328 data_fake.flags = 0;
5330 data_fake.whilem_c = data->whilem_c;
5331 data_fake.last_closep = data->last_closep;
5334 data_fake.last_closep = &fake;
5335 data_fake.pos_delta = delta;
5336 if (flags & SCF_DO_STCLASS) {
5337 ssc_init(pRExC_state, &this_class);
5338 data_fake.start_class = &this_class;
5339 f = SCF_DO_STCLASS_AND;
5341 if (flags & SCF_WHILEM_VISITED_POS)
5342 f |= SCF_WHILEM_VISITED_POS;
5344 if (trie->jump[word]) {
5346 nextbranch = trie_node + trie->jump[0];
5347 scan= trie_node + trie->jump[word];
5348 /* We go from the jump point to the branch that follows
5349 it. Note this means we need the vestigal unused
5350 branches even though they arent otherwise used. */
5351 minnext = study_chunk(pRExC_state, &scan, minlenp,
5352 &deltanext, (regnode *)nextbranch, &data_fake,
5353 stopparen, recursed_depth, NULL, f,depth+1);
5355 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5356 nextbranch= regnext((regnode*)nextbranch);
5358 if (min1 > (SSize_t)(minnext + trie->minlen))
5359 min1 = minnext + trie->minlen;
5360 if (deltanext == SSize_t_MAX) {
5361 is_inf = is_inf_internal = 1;
5363 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5364 max1 = minnext + deltanext + trie->maxlen;
5366 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5368 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5369 if ( stopmin > min + min1)
5370 stopmin = min + min1;
5371 flags &= ~SCF_DO_SUBSTR;
5373 data->flags |= SCF_SEEN_ACCEPT;
5376 if (data_fake.flags & SF_HAS_EVAL)
5377 data->flags |= SF_HAS_EVAL;
5378 data->whilem_c = data_fake.whilem_c;
5380 if (flags & SCF_DO_STCLASS)
5381 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5384 if (flags & SCF_DO_SUBSTR) {
5385 data->pos_min += min1;
5386 data->pos_delta += max1 - min1;
5387 if (max1 != min1 || is_inf)
5388 data->longest = &(data->longest_float);
5391 delta += max1 - min1;
5392 if (flags & SCF_DO_STCLASS_OR) {
5393 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5395 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5396 flags &= ~SCF_DO_STCLASS;
5399 else if (flags & SCF_DO_STCLASS_AND) {
5401 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5402 flags &= ~SCF_DO_STCLASS;
5405 /* Switch to OR mode: cache the old value of
5406 * data->start_class */
5408 StructCopy(data->start_class, and_withp, regnode_ssc);
5409 flags &= ~SCF_DO_STCLASS_AND;
5410 StructCopy(&accum, data->start_class, regnode_ssc);
5411 flags |= SCF_DO_STCLASS_OR;
5418 else if (PL_regkind[OP(scan)] == TRIE) {
5419 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5422 min += trie->minlen;
5423 delta += (trie->maxlen - trie->minlen);
5424 flags &= ~SCF_DO_STCLASS; /* xxx */
5425 if (flags & SCF_DO_SUBSTR) {
5426 /* Cannot expect anything... */
5427 scan_commit(pRExC_state, data, minlenp, is_inf);
5428 data->pos_min += trie->minlen;
5429 data->pos_delta += (trie->maxlen - trie->minlen);
5430 if (trie->maxlen != trie->minlen)
5431 data->longest = &(data->longest_float);
5433 if (trie->jump) /* no more substrings -- for now /grr*/
5434 flags &= ~SCF_DO_SUBSTR;
5436 #endif /* old or new */
5437 #endif /* TRIE_STUDY_OPT */
5439 /* Else: zero-length, ignore. */
5440 scan = regnext(scan);
5442 /* If we are exiting a recursion we can unset its recursed bit
5443 * and allow ourselves to enter it again - no danger of an
5444 * infinite loop there.
5445 if (stopparen > -1 && recursed) {
5446 DEBUG_STUDYDATA("unset:", data,depth);
5447 PAREN_UNSET( recursed, stopparen);
5451 DEBUG_STUDYDATA("frame-end:",data,depth);
5452 DEBUG_PEEP("fend", scan, depth);
5453 /* restore previous context */
5456 stopparen = frame->stop;
5457 recursed_depth = frame->prev_recursed_depth;
5460 frame = frame->prev;
5461 goto fake_study_recurse;
5466 DEBUG_STUDYDATA("pre-fin:",data,depth);
5469 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5471 if (flags & SCF_DO_SUBSTR && is_inf)
5472 data->pos_delta = SSize_t_MAX - data->pos_min;
5473 if (is_par > (I32)U8_MAX)
5475 if (is_par && pars==1 && data) {
5476 data->flags |= SF_IN_PAR;
5477 data->flags &= ~SF_HAS_PAR;
5479 else if (pars && data) {
5480 data->flags |= SF_HAS_PAR;
5481 data->flags &= ~SF_IN_PAR;
5483 if (flags & SCF_DO_STCLASS_OR)
5484 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5485 if (flags & SCF_TRIE_RESTUDY)
5486 data->flags |= SCF_TRIE_RESTUDY;
5488 DEBUG_STUDYDATA("post-fin:",data,depth);
5491 SSize_t final_minlen= min < stopmin ? min : stopmin;
5493 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5494 RExC_maxlen = final_minlen + delta;
5496 return final_minlen;
5502 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5504 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5506 PERL_ARGS_ASSERT_ADD_DATA;
5508 Renewc(RExC_rxi->data,
5509 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5510 char, struct reg_data);
5512 Renew(RExC_rxi->data->what, count + n, U8);
5514 Newx(RExC_rxi->data->what, n, U8);
5515 RExC_rxi->data->count = count + n;
5516 Copy(s, RExC_rxi->data->what + count, n, U8);
5520 /*XXX: todo make this not included in a non debugging perl, but appears to be
5521 * used anyway there, in 'use re' */
5522 #ifndef PERL_IN_XSUB_RE
5524 Perl_reginitcolors(pTHX)
5527 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5529 char *t = savepv(s);
5533 t = strchr(t, '\t');
5539 PL_colors[i] = t = (char *)"";
5544 PL_colors[i++] = (char *)"";
5551 #ifdef TRIE_STUDY_OPT
5552 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5555 (data.flags & SCF_TRIE_RESTUDY) \
5563 #define CHECK_RESTUDY_GOTO_butfirst
5567 * pregcomp - compile a regular expression into internal code
5569 * Decides which engine's compiler to call based on the hint currently in
5573 #ifndef PERL_IN_XSUB_RE
5575 /* return the currently in-scope regex engine (or the default if none) */
5577 regexp_engine const *
5578 Perl_current_re_engine(pTHX)
5582 if (IN_PERL_COMPILETIME) {
5583 HV * const table = GvHV(PL_hintgv);
5586 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5587 return &reh_regexp_engine;
5588 ptr = hv_fetchs(table, "regcomp", FALSE);
5589 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5590 return &reh_regexp_engine;
5591 return INT2PTR(regexp_engine*,SvIV(*ptr));
5595 if (!PL_curcop->cop_hints_hash)
5596 return &reh_regexp_engine;
5597 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5598 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5599 return &reh_regexp_engine;
5600 return INT2PTR(regexp_engine*,SvIV(ptr));
5606 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5609 regexp_engine const *eng = current_re_engine();
5610 GET_RE_DEBUG_FLAGS_DECL;
5612 PERL_ARGS_ASSERT_PREGCOMP;
5614 /* Dispatch a request to compile a regexp to correct regexp engine. */
5616 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5619 return CALLREGCOMP_ENG(eng, pattern, flags);
5623 /* public(ish) entry point for the perl core's own regex compiling code.
5624 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5625 * pattern rather than a list of OPs, and uses the internal engine rather
5626 * than the current one */
5629 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5631 SV *pat = pattern; /* defeat constness! */
5632 PERL_ARGS_ASSERT_RE_COMPILE;
5633 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5634 #ifdef PERL_IN_XSUB_RE
5639 NULL, NULL, rx_flags, 0);
5643 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5644 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5645 * point to the realloced string and length.
5647 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5651 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5652 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5654 U8 *const src = (U8*)*pat_p;
5657 STRLEN s = 0, d = 0;
5659 GET_RE_DEBUG_FLAGS_DECL;
5661 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5662 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5664 Newx(dst, *plen_p * 2 + 1, U8);
5666 while (s < *plen_p) {
5667 if (NATIVE_BYTE_IS_INVARIANT(src[s]))
5670 dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
5671 dst[d] = UTF8_EIGHT_BIT_LO(src[s]);
5673 if (n < num_code_blocks) {
5674 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5675 pRExC_state->code_blocks[n].start = d;
5676 assert(dst[d] == '(');
5679 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5680 pRExC_state->code_blocks[n].end = d;
5681 assert(dst[d] == ')');
5691 *pat_p = (char*) dst;
5693 RExC_orig_utf8 = RExC_utf8 = 1;
5698 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5699 * while recording any code block indices, and handling overloading,
5700 * nested qr// objects etc. If pat is null, it will allocate a new
5701 * string, or just return the first arg, if there's only one.
5703 * Returns the malloced/updated pat.
5704 * patternp and pat_count is the array of SVs to be concatted;
5705 * oplist is the optional list of ops that generated the SVs;
5706 * recompile_p is a pointer to a boolean that will be set if
5707 * the regex will need to be recompiled.
5708 * delim, if non-null is an SV that will be inserted between each element
5712 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5713 SV *pat, SV ** const patternp, int pat_count,
5714 OP *oplist, bool *recompile_p, SV *delim)
5718 bool use_delim = FALSE;
5719 bool alloced = FALSE;
5721 /* if we know we have at least two args, create an empty string,
5722 * then concatenate args to that. For no args, return an empty string */
5723 if (!pat && pat_count != 1) {
5729 for (svp = patternp; svp < patternp + pat_count; svp++) {
5732 STRLEN orig_patlen = 0;
5734 SV *msv = use_delim ? delim : *svp;
5735 if (!msv) msv = &PL_sv_undef;
5737 /* if we've got a delimiter, we go round the loop twice for each
5738 * svp slot (except the last), using the delimiter the second
5747 if (SvTYPE(msv) == SVt_PVAV) {
5748 /* we've encountered an interpolated array within
5749 * the pattern, e.g. /...@a..../. Expand the list of elements,
5750 * then recursively append elements.
5751 * The code in this block is based on S_pushav() */
5753 AV *const av = (AV*)msv;
5754 const SSize_t maxarg = AvFILL(av) + 1;
5758 assert(oplist->op_type == OP_PADAV
5759 || oplist->op_type == OP_RV2AV);
5760 oplist = oplist->op_sibling;;
5763 if (SvRMAGICAL(av)) {
5766 Newx(array, maxarg, SV*);
5768 for (i=0; i < maxarg; i++) {
5769 SV ** const svp = av_fetch(av, i, FALSE);
5770 array[i] = svp ? *svp : &PL_sv_undef;
5774 array = AvARRAY(av);
5776 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5777 array, maxarg, NULL, recompile_p,
5779 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5785 /* we make the assumption here that each op in the list of
5786 * op_siblings maps to one SV pushed onto the stack,
5787 * except for code blocks, with have both an OP_NULL and
5789 * This allows us to match up the list of SVs against the
5790 * list of OPs to find the next code block.
5792 * Note that PUSHMARK PADSV PADSV ..
5794 * PADRANGE PADSV PADSV ..
5795 * so the alignment still works. */
5798 if (oplist->op_type == OP_NULL
5799 && (oplist->op_flags & OPf_SPECIAL))
5801 assert(n < pRExC_state->num_code_blocks);
5802 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5803 pRExC_state->code_blocks[n].block = oplist;
5804 pRExC_state->code_blocks[n].src_regex = NULL;
5807 oplist = oplist->op_sibling; /* skip CONST */
5810 oplist = oplist->op_sibling;;
5813 /* apply magic and QR overloading to arg */
5816 if (SvROK(msv) && SvAMAGIC(msv)) {
5817 SV *sv = AMG_CALLunary(msv, regexp_amg);
5821 if (SvTYPE(sv) != SVt_REGEXP)
5822 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5827 /* try concatenation overload ... */
5828 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5829 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5832 /* overloading involved: all bets are off over literal
5833 * code. Pretend we haven't seen it */
5834 pRExC_state->num_code_blocks -= n;
5838 /* ... or failing that, try "" overload */
5839 while (SvAMAGIC(msv)
5840 && (sv = AMG_CALLunary(msv, string_amg))
5844 && SvRV(msv) == SvRV(sv))
5849 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5853 /* this is a partially unrolled
5854 * sv_catsv_nomg(pat, msv);
5855 * that allows us to adjust code block indices if
5858 char *dst = SvPV_force_nomg(pat, dlen);
5860 if (SvUTF8(msv) && !SvUTF8(pat)) {
5861 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5862 sv_setpvn(pat, dst, dlen);
5865 sv_catsv_nomg(pat, msv);
5872 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5875 /* extract any code blocks within any embedded qr//'s */
5876 if (rx && SvTYPE(rx) == SVt_REGEXP
5877 && RX_ENGINE((REGEXP*)rx)->op_comp)
5880 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5881 if (ri->num_code_blocks) {
5883 /* the presence of an embedded qr// with code means
5884 * we should always recompile: the text of the
5885 * qr// may not have changed, but it may be a
5886 * different closure than last time */
5888 Renew(pRExC_state->code_blocks,
5889 pRExC_state->num_code_blocks + ri->num_code_blocks,
5890 struct reg_code_block);
5891 pRExC_state->num_code_blocks += ri->num_code_blocks;
5893 for (i=0; i < ri->num_code_blocks; i++) {
5894 struct reg_code_block *src, *dst;
5895 STRLEN offset = orig_patlen
5896 + ReANY((REGEXP *)rx)->pre_prefix;
5897 assert(n < pRExC_state->num_code_blocks);
5898 src = &ri->code_blocks[i];
5899 dst = &pRExC_state->code_blocks[n];
5900 dst->start = src->start + offset;
5901 dst->end = src->end + offset;
5902 dst->block = src->block;
5903 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5912 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5921 /* see if there are any run-time code blocks in the pattern.
5922 * False positives are allowed */
5925 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5926 char *pat, STRLEN plen)
5931 PERL_UNUSED_CONTEXT;
5933 for (s = 0; s < plen; s++) {
5934 if (n < pRExC_state->num_code_blocks
5935 && s == pRExC_state->code_blocks[n].start)
5937 s = pRExC_state->code_blocks[n].end;
5941 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5943 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5945 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5952 /* Handle run-time code blocks. We will already have compiled any direct
5953 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5954 * copy of it, but with any literal code blocks blanked out and
5955 * appropriate chars escaped; then feed it into
5957 * eval "qr'modified_pattern'"
5961 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5965 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5967 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5968 * and merge them with any code blocks of the original regexp.
5970 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5971 * instead, just save the qr and return FALSE; this tells our caller that
5972 * the original pattern needs upgrading to utf8.
5976 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5977 char *pat, STRLEN plen)
5981 GET_RE_DEBUG_FLAGS_DECL;
5983 if (pRExC_state->runtime_code_qr) {
5984 /* this is the second time we've been called; this should
5985 * only happen if the main pattern got upgraded to utf8
5986 * during compilation; re-use the qr we compiled first time
5987 * round (which should be utf8 too)
5989 qr = pRExC_state->runtime_code_qr;
5990 pRExC_state->runtime_code_qr = NULL;
5991 assert(RExC_utf8 && SvUTF8(qr));
5997 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6001 /* determine how many extra chars we need for ' and \ escaping */
6002 for (s = 0; s < plen; s++) {
6003 if (pat[s] == '\'' || pat[s] == '\\')
6007 Newx(newpat, newlen, char);
6009 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6011 for (s = 0; s < plen; s++) {
6012 if (n < pRExC_state->num_code_blocks
6013 && s == pRExC_state->code_blocks[n].start)
6015 /* blank out literal code block */
6016 assert(pat[s] == '(');
6017 while (s <= pRExC_state->code_blocks[n].end) {
6025 if (pat[s] == '\'' || pat[s] == '\\')
6030 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6034 PerlIO_printf(Perl_debug_log,
6035 "%sre-parsing pattern for runtime code:%s %s\n",
6036 PL_colors[4],PL_colors[5],newpat);
6039 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6045 PUSHSTACKi(PERLSI_REQUIRE);
6046 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6047 * parsing qr''; normally only q'' does this. It also alters
6049 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6050 SvREFCNT_dec_NN(sv);
6055 SV * const errsv = ERRSV;
6056 if (SvTRUE_NN(errsv))
6058 Safefree(pRExC_state->code_blocks);
6059 /* use croak_sv ? */
6060 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6063 assert(SvROK(qr_ref));
6065 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6066 /* the leaving below frees the tmp qr_ref.
6067 * Give qr a life of its own */
6075 if (!RExC_utf8 && SvUTF8(qr)) {
6076 /* first time through; the pattern got upgraded; save the
6077 * qr for the next time through */
6078 assert(!pRExC_state->runtime_code_qr);
6079 pRExC_state->runtime_code_qr = qr;
6084 /* extract any code blocks within the returned qr// */
6087 /* merge the main (r1) and run-time (r2) code blocks into one */
6089 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6090 struct reg_code_block *new_block, *dst;
6091 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6094 if (!r2->num_code_blocks) /* we guessed wrong */
6096 SvREFCNT_dec_NN(qr);
6101 r1->num_code_blocks + r2->num_code_blocks,
6102 struct reg_code_block);
6105 while ( i1 < r1->num_code_blocks
6106 || i2 < r2->num_code_blocks)
6108 struct reg_code_block *src;
6111 if (i1 == r1->num_code_blocks) {
6112 src = &r2->code_blocks[i2++];
6115 else if (i2 == r2->num_code_blocks)
6116 src = &r1->code_blocks[i1++];
6117 else if ( r1->code_blocks[i1].start
6118 < r2->code_blocks[i2].start)
6120 src = &r1->code_blocks[i1++];
6121 assert(src->end < r2->code_blocks[i2].start);
6124 assert( r1->code_blocks[i1].start
6125 > r2->code_blocks[i2].start);
6126 src = &r2->code_blocks[i2++];
6128 assert(src->end < r1->code_blocks[i1].start);
6131 assert(pat[src->start] == '(');
6132 assert(pat[src->end] == ')');
6133 dst->start = src->start;
6134 dst->end = src->end;
6135 dst->block = src->block;
6136 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6140 r1->num_code_blocks += r2->num_code_blocks;
6141 Safefree(r1->code_blocks);
6142 r1->code_blocks = new_block;
6145 SvREFCNT_dec_NN(qr);
6151 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6152 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6153 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6154 STRLEN longest_length, bool eol, bool meol)
6156 /* This is the common code for setting up the floating and fixed length
6157 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6158 * as to whether succeeded or not */
6163 if (! (longest_length
6164 || (eol /* Can't have SEOL and MULTI */
6165 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6167 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6168 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6173 /* copy the information about the longest from the reg_scan_data
6174 over to the program. */
6175 if (SvUTF8(sv_longest)) {
6176 *rx_utf8 = sv_longest;
6179 *rx_substr = sv_longest;
6182 /* end_shift is how many chars that must be matched that
6183 follow this item. We calculate it ahead of time as once the
6184 lookbehind offset is added in we lose the ability to correctly
6186 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6187 *rx_end_shift = ml - offset
6188 - longest_length + (SvTAIL(sv_longest) != 0)
6191 t = (eol/* Can't have SEOL and MULTI */
6192 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6193 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6199 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6200 * regular expression into internal code.
6201 * The pattern may be passed either as:
6202 * a list of SVs (patternp plus pat_count)
6203 * a list of OPs (expr)
6204 * If both are passed, the SV list is used, but the OP list indicates
6205 * which SVs are actually pre-compiled code blocks
6207 * The SVs in the list have magic and qr overloading applied to them (and
6208 * the list may be modified in-place with replacement SVs in the latter
6211 * If the pattern hasn't changed from old_re, then old_re will be
6214 * eng is the current engine. If that engine has an op_comp method, then
6215 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6216 * do the initial concatenation of arguments and pass on to the external
6219 * If is_bare_re is not null, set it to a boolean indicating whether the
6220 * arg list reduced (after overloading) to a single bare regex which has
6221 * been returned (i.e. /$qr/).
6223 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6225 * pm_flags contains the PMf_* flags, typically based on those from the
6226 * pm_flags field of the related PMOP. Currently we're only interested in
6227 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6229 * We can't allocate space until we know how big the compiled form will be,
6230 * but we can't compile it (and thus know how big it is) until we've got a
6231 * place to put the code. So we cheat: we compile it twice, once with code
6232 * generation turned off and size counting turned on, and once "for real".
6233 * This also means that we don't allocate space until we are sure that the
6234 * thing really will compile successfully, and we never have to move the
6235 * code and thus invalidate pointers into it. (Note that it has to be in
6236 * one piece because free() must be able to free it all.) [NB: not true in perl]
6238 * Beware that the optimization-preparation code in here knows about some
6239 * of the structure of the compiled regexp. [I'll say.]
6243 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6244 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6245 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6250 regexp_internal *ri;
6258 SV *code_blocksv = NULL;
6259 SV** new_patternp = patternp;
6261 /* these are all flags - maybe they should be turned
6262 * into a single int with different bit masks */
6263 I32 sawlookahead = 0;
6268 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6270 bool runtime_code = 0;
6272 RExC_state_t RExC_state;
6273 RExC_state_t * const pRExC_state = &RExC_state;
6274 #ifdef TRIE_STUDY_OPT
6276 RExC_state_t copyRExC_state;
6278 GET_RE_DEBUG_FLAGS_DECL;
6280 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6282 DEBUG_r(if (!PL_colorset) reginitcolors());
6284 #ifndef PERL_IN_XSUB_RE
6285 /* Initialize these here instead of as-needed, as is quick and avoids
6286 * having to test them each time otherwise */
6287 if (! PL_AboveLatin1) {
6288 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6289 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6290 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6291 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6292 PL_HasMultiCharFold =
6293 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6297 pRExC_state->code_blocks = NULL;
6298 pRExC_state->num_code_blocks = 0;
6301 *is_bare_re = FALSE;
6303 if (expr && (expr->op_type == OP_LIST ||
6304 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6305 /* allocate code_blocks if needed */
6309 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
6310 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6311 ncode++; /* count of DO blocks */
6313 pRExC_state->num_code_blocks = ncode;
6314 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6319 /* compile-time pattern with just OP_CONSTs and DO blocks */
6324 /* find how many CONSTs there are */
6327 if (expr->op_type == OP_CONST)
6330 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6331 if (o->op_type == OP_CONST)
6335 /* fake up an SV array */
6337 assert(!new_patternp);
6338 Newx(new_patternp, n, SV*);
6339 SAVEFREEPV(new_patternp);
6343 if (expr->op_type == OP_CONST)
6344 new_patternp[n] = cSVOPx_sv(expr);
6346 for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
6347 if (o->op_type == OP_CONST)
6348 new_patternp[n++] = cSVOPo_sv;
6353 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6354 "Assembling pattern from %d elements%s\n", pat_count,
6355 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6357 /* set expr to the first arg op */
6359 if (pRExC_state->num_code_blocks
6360 && expr->op_type != OP_CONST)
6362 expr = cLISTOPx(expr)->op_first;
6363 assert( expr->op_type == OP_PUSHMARK
6364 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6365 || expr->op_type == OP_PADRANGE);
6366 expr = expr->op_sibling;
6369 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6370 expr, &recompile, NULL);
6372 /* handle bare (possibly after overloading) regex: foo =~ $re */
6377 if (SvTYPE(re) == SVt_REGEXP) {
6381 Safefree(pRExC_state->code_blocks);
6382 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6383 "Precompiled pattern%s\n",
6384 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6390 exp = SvPV_nomg(pat, plen);
6392 if (!eng->op_comp) {
6393 if ((SvUTF8(pat) && IN_BYTES)
6394 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6396 /* make a temporary copy; either to convert to bytes,
6397 * or to avoid repeating get-magic / overloaded stringify */
6398 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6399 (IN_BYTES ? 0 : SvUTF8(pat)));
6401 Safefree(pRExC_state->code_blocks);
6402 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6405 /* ignore the utf8ness if the pattern is 0 length */
6406 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6407 RExC_uni_semantics = 0;
6408 RExC_contains_locale = 0;
6409 RExC_contains_i = 0;
6410 pRExC_state->runtime_code_qr = NULL;
6413 SV *dsv= sv_newmortal();
6414 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6415 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6416 PL_colors[4],PL_colors[5],s);
6420 /* we jump here if we upgrade the pattern to utf8 and have to
6423 if ((pm_flags & PMf_USE_RE_EVAL)
6424 /* this second condition covers the non-regex literal case,
6425 * i.e. $foo =~ '(?{})'. */
6426 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6428 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6430 /* return old regex if pattern hasn't changed */
6431 /* XXX: note in the below we have to check the flags as well as the
6434 * Things get a touch tricky as we have to compare the utf8 flag
6435 * independently from the compile flags. */
6439 && !!RX_UTF8(old_re) == !!RExC_utf8
6440 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6441 && RX_PRECOMP(old_re)
6442 && RX_PRELEN(old_re) == plen
6443 && memEQ(RX_PRECOMP(old_re), exp, plen)
6444 && !runtime_code /* with runtime code, always recompile */ )
6446 Safefree(pRExC_state->code_blocks);
6450 rx_flags = orig_rx_flags;
6452 if (rx_flags & PMf_FOLD) {
6453 RExC_contains_i = 1;
6455 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6457 /* Set to use unicode semantics if the pattern is in utf8 and has the
6458 * 'depends' charset specified, as it means unicode when utf8 */
6459 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6463 RExC_flags = rx_flags;
6464 RExC_pm_flags = pm_flags;
6467 if (TAINTING_get && TAINT_get)
6468 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6470 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6471 /* whoops, we have a non-utf8 pattern, whilst run-time code
6472 * got compiled as utf8. Try again with a utf8 pattern */
6473 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6474 pRExC_state->num_code_blocks);
6475 goto redo_first_pass;
6478 assert(!pRExC_state->runtime_code_qr);
6484 RExC_in_lookbehind = 0;
6485 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6487 RExC_override_recoding = 0;
6488 RExC_in_multi_char_class = 0;
6490 /* First pass: determine size, legality. */
6493 RExC_end = exp + plen;
6498 RExC_emit = (regnode *) &RExC_emit_dummy;
6499 RExC_whilem_seen = 0;
6500 RExC_open_parens = NULL;
6501 RExC_close_parens = NULL;
6503 RExC_paren_names = NULL;
6505 RExC_paren_name_list = NULL;
6507 RExC_recurse = NULL;
6508 RExC_study_chunk_recursed = NULL;
6509 RExC_study_chunk_recursed_bytes= 0;
6510 RExC_recurse_count = 0;
6511 pRExC_state->code_index = 0;
6513 #if 0 /* REGC() is (currently) a NOP at the first pass.
6514 * Clever compilers notice this and complain. --jhi */
6515 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6518 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6520 RExC_lastparse=NULL;
6522 /* reg may croak on us, not giving us a chance to free
6523 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6524 need it to survive as long as the regexp (qr/(?{})/).
6525 We must check that code_blocksv is not already set, because we may
6526 have jumped back to restart the sizing pass. */
6527 if (pRExC_state->code_blocks && !code_blocksv) {
6528 code_blocksv = newSV_type(SVt_PV);
6529 SAVEFREESV(code_blocksv);
6530 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6531 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6533 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6534 /* It's possible to write a regexp in ascii that represents Unicode
6535 codepoints outside of the byte range, such as via \x{100}. If we
6536 detect such a sequence we have to convert the entire pattern to utf8
6537 and then recompile, as our sizing calculation will have been based
6538 on 1 byte == 1 character, but we will need to use utf8 to encode
6539 at least some part of the pattern, and therefore must convert the whole
6542 if (flags & RESTART_UTF8) {
6543 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6544 pRExC_state->num_code_blocks);
6545 goto redo_first_pass;
6547 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6550 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6553 PerlIO_printf(Perl_debug_log,
6554 "Required size %"IVdf" nodes\n"
6555 "Starting second pass (creation)\n",
6558 RExC_lastparse=NULL;
6561 /* The first pass could have found things that force Unicode semantics */
6562 if ((RExC_utf8 || RExC_uni_semantics)
6563 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6565 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6568 /* Small enough for pointer-storage convention?
6569 If extralen==0, this means that we will not need long jumps. */
6570 if (RExC_size >= 0x10000L && RExC_extralen)
6571 RExC_size += RExC_extralen;
6574 if (RExC_whilem_seen > 15)
6575 RExC_whilem_seen = 15;
6577 /* Allocate space and zero-initialize. Note, the two step process
6578 of zeroing when in debug mode, thus anything assigned has to
6579 happen after that */
6580 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6582 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6583 char, regexp_internal);
6584 if ( r == NULL || ri == NULL )
6585 FAIL("Regexp out of space");
6587 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6588 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6591 /* bulk initialize base fields with 0. */
6592 Zero(ri, sizeof(regexp_internal), char);
6595 /* non-zero initialization begins here */
6598 r->extflags = rx_flags;
6599 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6601 if (pm_flags & PMf_IS_QR) {
6602 ri->code_blocks = pRExC_state->code_blocks;
6603 ri->num_code_blocks = pRExC_state->num_code_blocks;
6608 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6609 if (pRExC_state->code_blocks[n].src_regex)
6610 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6611 SAVEFREEPV(pRExC_state->code_blocks);
6615 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6616 bool has_charset = (get_regex_charset(r->extflags)
6617 != REGEX_DEPENDS_CHARSET);
6619 /* The caret is output if there are any defaults: if not all the STD
6620 * flags are set, or if no character set specifier is needed */
6622 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6624 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6625 == REG_RUN_ON_COMMENT_SEEN);
6626 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6627 >> RXf_PMf_STD_PMMOD_SHIFT);
6628 const char *fptr = STD_PAT_MODS; /*"msix"*/
6630 /* Allocate for the worst case, which is all the std flags are turned
6631 * on. If more precision is desired, we could do a population count of
6632 * the flags set. This could be done with a small lookup table, or by
6633 * shifting, masking and adding, or even, when available, assembly
6634 * language for a machine-language population count.
6635 * We never output a minus, as all those are defaults, so are
6636 * covered by the caret */
6637 const STRLEN wraplen = plen + has_p + has_runon
6638 + has_default /* If needs a caret */
6640 /* If needs a character set specifier */
6641 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6642 + (sizeof(STD_PAT_MODS) - 1)
6643 + (sizeof("(?:)") - 1);
6645 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6646 r->xpv_len_u.xpvlenu_pv = p;
6648 SvFLAGS(rx) |= SVf_UTF8;
6651 /* If a default, cover it using the caret */
6653 *p++= DEFAULT_PAT_MOD;
6657 const char* const name = get_regex_charset_name(r->extflags, &len);
6658 Copy(name, p, len, char);
6662 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6665 while((ch = *fptr++)) {
6673 Copy(RExC_precomp, p, plen, char);
6674 assert ((RX_WRAPPED(rx) - p) < 16);
6675 r->pre_prefix = p - RX_WRAPPED(rx);
6681 SvCUR_set(rx, p - RX_WRAPPED(rx));
6685 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6687 /* setup various meta data about recursion, this all requires
6688 * RExC_npar to be correctly set, and a bit later on we clear it */
6689 if (RExC_seen & REG_RECURSE_SEEN) {
6690 Newxz(RExC_open_parens, RExC_npar,regnode *);
6691 SAVEFREEPV(RExC_open_parens);
6692 Newxz(RExC_close_parens,RExC_npar,regnode *);
6693 SAVEFREEPV(RExC_close_parens);
6695 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6696 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6697 * So its 1 if there are no parens. */
6698 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6699 ((RExC_npar & 0x07) != 0);
6700 Newx(RExC_study_chunk_recursed,
6701 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6702 SAVEFREEPV(RExC_study_chunk_recursed);
6705 /* Useful during FAIL. */
6706 #ifdef RE_TRACK_PATTERN_OFFSETS
6707 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6708 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6709 "%s %"UVuf" bytes for offset annotations.\n",
6710 ri->u.offsets ? "Got" : "Couldn't get",
6711 (UV)((2*RExC_size+1) * sizeof(U32))));
6713 SetProgLen(ri,RExC_size);
6717 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6719 /* Second pass: emit code. */
6720 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6721 RExC_pm_flags = pm_flags;
6723 RExC_end = exp + plen;
6726 RExC_emit_start = ri->program;
6727 RExC_emit = ri->program;
6728 RExC_emit_bound = ri->program + RExC_size + 1;
6729 pRExC_state->code_index = 0;
6731 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6732 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6734 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6736 /* XXXX To minimize changes to RE engine we always allocate
6737 3-units-long substrs field. */
6738 Newx(r->substrs, 1, struct reg_substr_data);
6739 if (RExC_recurse_count) {
6740 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6741 SAVEFREEPV(RExC_recurse);
6745 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6746 Zero(r->substrs, 1, struct reg_substr_data);
6747 if (RExC_study_chunk_recursed)
6748 Zero(RExC_study_chunk_recursed,
6749 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6751 #ifdef TRIE_STUDY_OPT
6753 StructCopy(&zero_scan_data, &data, scan_data_t);
6754 copyRExC_state = RExC_state;
6757 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6759 RExC_state = copyRExC_state;
6760 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6761 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6763 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6764 StructCopy(&zero_scan_data, &data, scan_data_t);
6767 StructCopy(&zero_scan_data, &data, scan_data_t);
6770 /* Dig out information for optimizations. */
6771 r->extflags = RExC_flags; /* was pm_op */
6772 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6775 SvUTF8_on(rx); /* Unicode in it? */
6776 ri->regstclass = NULL;
6777 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6778 r->intflags |= PREGf_NAUGHTY;
6779 scan = ri->program + 1; /* First BRANCH. */
6781 /* testing for BRANCH here tells us whether there is "must appear"
6782 data in the pattern. If there is then we can use it for optimisations */
6783 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
6786 STRLEN longest_float_length, longest_fixed_length;
6787 regnode_ssc ch_class; /* pointed to by data */
6789 SSize_t last_close = 0; /* pointed to by data */
6790 regnode *first= scan;
6791 regnode *first_next= regnext(first);
6793 * Skip introductions and multiplicators >= 1
6794 * so that we can extract the 'meat' of the pattern that must
6795 * match in the large if() sequence following.
6796 * NOTE that EXACT is NOT covered here, as it is normally
6797 * picked up by the optimiser separately.
6799 * This is unfortunate as the optimiser isnt handling lookahead
6800 * properly currently.
6803 while ((OP(first) == OPEN && (sawopen = 1)) ||
6804 /* An OR of *one* alternative - should not happen now. */
6805 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6806 /* for now we can't handle lookbehind IFMATCH*/
6807 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6808 (OP(first) == PLUS) ||
6809 (OP(first) == MINMOD) ||
6810 /* An {n,m} with n>0 */
6811 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6812 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6815 * the only op that could be a regnode is PLUS, all the rest
6816 * will be regnode_1 or regnode_2.
6818 * (yves doesn't think this is true)
6820 if (OP(first) == PLUS)
6823 if (OP(first) == MINMOD)
6825 first += regarglen[OP(first)];
6827 first = NEXTOPER(first);
6828 first_next= regnext(first);
6831 /* Starting-point info. */
6833 DEBUG_PEEP("first:",first,0);
6834 /* Ignore EXACT as we deal with it later. */
6835 if (PL_regkind[OP(first)] == EXACT) {
6836 if (OP(first) == EXACT)
6837 NOOP; /* Empty, get anchored substr later. */
6839 ri->regstclass = first;
6842 else if (PL_regkind[OP(first)] == TRIE &&
6843 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6845 /* this can happen only on restudy */
6846 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6849 else if (REGNODE_SIMPLE(OP(first)))
6850 ri->regstclass = first;
6851 else if (PL_regkind[OP(first)] == BOUND ||
6852 PL_regkind[OP(first)] == NBOUND)
6853 ri->regstclass = first;
6854 else if (PL_regkind[OP(first)] == BOL) {
6855 r->intflags |= (OP(first) == MBOL
6857 : (OP(first) == SBOL
6860 first = NEXTOPER(first);
6863 else if (OP(first) == GPOS) {
6864 r->intflags |= PREGf_ANCH_GPOS;
6865 first = NEXTOPER(first);
6868 else if ((!sawopen || !RExC_sawback) &&
6869 (OP(first) == STAR &&
6870 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6871 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6873 /* turn .* into ^.* with an implied $*=1 */
6875 (OP(NEXTOPER(first)) == REG_ANY)
6878 r->intflags |= (type | PREGf_IMPLICIT);
6879 first = NEXTOPER(first);
6882 if (sawplus && !sawminmod && !sawlookahead
6883 && (!sawopen || !RExC_sawback)
6884 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6885 /* x+ must match at the 1st pos of run of x's */
6886 r->intflags |= PREGf_SKIP;
6888 /* Scan is after the zeroth branch, first is atomic matcher. */
6889 #ifdef TRIE_STUDY_OPT
6892 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6893 (IV)(first - scan + 1))
6897 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6898 (IV)(first - scan + 1))
6904 * If there's something expensive in the r.e., find the
6905 * longest literal string that must appear and make it the
6906 * regmust. Resolve ties in favor of later strings, since
6907 * the regstart check works with the beginning of the r.e.
6908 * and avoiding duplication strengthens checking. Not a
6909 * strong reason, but sufficient in the absence of others.
6910 * [Now we resolve ties in favor of the earlier string if
6911 * it happens that c_offset_min has been invalidated, since the
6912 * earlier string may buy us something the later one won't.]
6915 data.longest_fixed = newSVpvs("");
6916 data.longest_float = newSVpvs("");
6917 data.last_found = newSVpvs("");
6918 data.longest = &(data.longest_fixed);
6919 ENTER_with_name("study_chunk");
6920 SAVEFREESV(data.longest_fixed);
6921 SAVEFREESV(data.longest_float);
6922 SAVEFREESV(data.last_found);
6924 if (!ri->regstclass) {
6925 ssc_init(pRExC_state, &ch_class);
6926 data.start_class = &ch_class;
6927 stclass_flag = SCF_DO_STCLASS_AND;
6928 } else /* XXXX Check for BOUND? */
6930 data.last_closep = &last_close;
6933 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6934 scan + RExC_size, /* Up to end */
6936 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6937 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6941 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6944 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6945 && data.last_start_min == 0 && data.last_end > 0
6946 && !RExC_seen_zerolen
6947 && !(RExC_seen & REG_VERBARG_SEEN)
6948 && !(RExC_seen & REG_GPOS_SEEN)
6950 r->extflags |= RXf_CHECK_ALL;
6952 scan_commit(pRExC_state, &data,&minlen,0);
6954 longest_float_length = CHR_SVLEN(data.longest_float);
6956 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6957 && data.offset_fixed == data.offset_float_min
6958 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6959 && S_setup_longest (aTHX_ pRExC_state,
6963 &(r->float_end_shift),
6964 data.lookbehind_float,
6965 data.offset_float_min,
6967 longest_float_length,
6968 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6969 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6971 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6972 r->float_max_offset = data.offset_float_max;
6973 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6974 r->float_max_offset -= data.lookbehind_float;
6975 SvREFCNT_inc_simple_void_NN(data.longest_float);
6978 r->float_substr = r->float_utf8 = NULL;
6979 longest_float_length = 0;
6982 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
6984 if (S_setup_longest (aTHX_ pRExC_state,
6986 &(r->anchored_utf8),
6987 &(r->anchored_substr),
6988 &(r->anchored_end_shift),
6989 data.lookbehind_fixed,
6992 longest_fixed_length,
6993 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
6994 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
6996 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
6997 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7000 r->anchored_substr = r->anchored_utf8 = NULL;
7001 longest_fixed_length = 0;
7003 LEAVE_with_name("study_chunk");
7006 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7007 ri->regstclass = NULL;
7009 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7011 && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7012 && !ssc_is_anything(data.start_class))
7014 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7016 ssc_finalize(pRExC_state, data.start_class);
7018 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7019 StructCopy(data.start_class,
7020 (regnode_ssc*)RExC_rxi->data->data[n],
7022 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7023 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7024 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7025 regprop(r, sv, (regnode*)data.start_class, NULL);
7026 PerlIO_printf(Perl_debug_log,
7027 "synthetic stclass \"%s\".\n",
7028 SvPVX_const(sv));});
7029 data.start_class = NULL;
7032 /* A temporary algorithm prefers floated substr to fixed one to dig
7034 if (longest_fixed_length > longest_float_length) {
7035 r->substrs->check_ix = 0;
7036 r->check_end_shift = r->anchored_end_shift;
7037 r->check_substr = r->anchored_substr;
7038 r->check_utf8 = r->anchored_utf8;
7039 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7040 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7041 r->intflags |= PREGf_NOSCAN;
7044 r->substrs->check_ix = 1;
7045 r->check_end_shift = r->float_end_shift;
7046 r->check_substr = r->float_substr;
7047 r->check_utf8 = r->float_utf8;
7048 r->check_offset_min = r->float_min_offset;
7049 r->check_offset_max = r->float_max_offset;
7051 if ((r->check_substr || r->check_utf8) ) {
7052 r->extflags |= RXf_USE_INTUIT;
7053 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7054 r->extflags |= RXf_INTUIT_TAIL;
7056 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7058 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7059 if ( (STRLEN)minlen < longest_float_length )
7060 minlen= longest_float_length;
7061 if ( (STRLEN)minlen < longest_fixed_length )
7062 minlen= longest_fixed_length;
7066 /* Several toplevels. Best we can is to set minlen. */
7068 regnode_ssc ch_class;
7069 SSize_t last_close = 0;
7071 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7073 scan = ri->program + 1;
7074 ssc_init(pRExC_state, &ch_class);
7075 data.start_class = &ch_class;
7076 data.last_closep = &last_close;
7079 minlen = study_chunk(pRExC_state,
7080 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7081 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7082 ? SCF_TRIE_DOING_RESTUDY
7086 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7088 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7089 = r->float_substr = r->float_utf8 = NULL;
7091 if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
7092 && ! ssc_is_anything(data.start_class))
7094 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7096 ssc_finalize(pRExC_state, data.start_class);
7098 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7099 StructCopy(data.start_class,
7100 (regnode_ssc*)RExC_rxi->data->data[n],
7102 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7103 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7104 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7105 regprop(r, sv, (regnode*)data.start_class, NULL);
7106 PerlIO_printf(Perl_debug_log,
7107 "synthetic stclass \"%s\".\n",
7108 SvPVX_const(sv));});
7109 data.start_class = NULL;
7113 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7114 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7115 r->maxlen = REG_INFTY;
7118 r->maxlen = RExC_maxlen;
7121 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7122 the "real" pattern. */
7124 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n",
7125 (IV)minlen, (IV)r->minlen, RExC_maxlen);
7127 r->minlenret = minlen;
7128 if (r->minlen < minlen)
7131 if (RExC_seen & REG_GPOS_SEEN)
7132 r->intflags |= PREGf_GPOS_SEEN;
7133 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7134 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7136 if (pRExC_state->num_code_blocks)
7137 r->extflags |= RXf_EVAL_SEEN;
7138 if (RExC_seen & REG_CANY_SEEN)
7139 r->intflags |= PREGf_CANY_SEEN;
7140 if (RExC_seen & REG_VERBARG_SEEN)
7142 r->intflags |= PREGf_VERBARG_SEEN;
7143 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7145 if (RExC_seen & REG_CUTGROUP_SEEN)
7146 r->intflags |= PREGf_CUTGROUP_SEEN;
7147 if (pm_flags & PMf_USE_RE_EVAL)
7148 r->intflags |= PREGf_USE_RE_EVAL;
7149 if (RExC_paren_names)
7150 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7152 RXp_PAREN_NAMES(r) = NULL;
7154 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7155 * so it can be used in pp.c */
7156 if (r->intflags & PREGf_ANCH)
7157 r->extflags |= RXf_IS_ANCHORED;
7161 /* this is used to identify "special" patterns that might result
7162 * in Perl NOT calling the regex engine and instead doing the match "itself",
7163 * particularly special cases in split//. By having the regex compiler
7164 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7165 * we avoid weird issues with equivalent patterns resulting in different behavior,
7166 * AND we allow non Perl engines to get the same optimizations by the setting the
7167 * flags appropriately - Yves */
7168 regnode *first = ri->program + 1;
7170 regnode *next = NEXTOPER(first);
7173 if (PL_regkind[fop] == NOTHING && nop == END)
7174 r->extflags |= RXf_NULL;
7175 else if (PL_regkind[fop] == BOL && nop == END)
7176 r->extflags |= RXf_START_ONLY;
7177 else if (fop == PLUS
7178 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7179 && OP(regnext(first)) == END)
7180 r->extflags |= RXf_WHITE;
7181 else if ( r->extflags & RXf_SPLIT
7183 && STR_LEN(first) == 1
7184 && *(STRING(first)) == ' '
7185 && OP(regnext(first)) == END )
7186 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7190 if (RExC_contains_locale) {
7191 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7195 if (RExC_paren_names) {
7196 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7197 ri->data->data[ri->name_list_idx]
7198 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7201 ri->name_list_idx = 0;
7203 if (RExC_recurse_count) {
7204 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7205 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7206 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7209 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7210 /* assume we don't need to swap parens around before we match */
7214 PerlIO_printf(Perl_debug_log,"Final program:\n");
7217 #ifdef RE_TRACK_PATTERN_OFFSETS
7218 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7219 const STRLEN len = ri->u.offsets[0];
7221 GET_RE_DEBUG_FLAGS_DECL;
7222 PerlIO_printf(Perl_debug_log,
7223 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7224 for (i = 1; i <= len; i++) {
7225 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7226 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7227 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7229 PerlIO_printf(Perl_debug_log, "\n");
7234 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7235 * by setting the regexp SV to readonly-only instead. If the
7236 * pattern's been recompiled, the USEDness should remain. */
7237 if (old_re && SvREADONLY(old_re))
7245 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7248 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7250 PERL_UNUSED_ARG(value);
7252 if (flags & RXapif_FETCH) {
7253 return reg_named_buff_fetch(rx, key, flags);
7254 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7255 Perl_croak_no_modify();
7257 } else if (flags & RXapif_EXISTS) {
7258 return reg_named_buff_exists(rx, key, flags)
7261 } else if (flags & RXapif_REGNAMES) {
7262 return reg_named_buff_all(rx, flags);
7263 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7264 return reg_named_buff_scalar(rx, flags);
7266 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7272 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7275 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7276 PERL_UNUSED_ARG(lastkey);
7278 if (flags & RXapif_FIRSTKEY)
7279 return reg_named_buff_firstkey(rx, flags);
7280 else if (flags & RXapif_NEXTKEY)
7281 return reg_named_buff_nextkey(rx, flags);
7283 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7290 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7293 AV *retarray = NULL;
7295 struct regexp *const rx = ReANY(r);
7297 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7299 if (flags & RXapif_ALL)
7302 if (rx && RXp_PAREN_NAMES(rx)) {
7303 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7306 SV* sv_dat=HeVAL(he_str);
7307 I32 *nums=(I32*)SvPVX(sv_dat);
7308 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7309 if ((I32)(rx->nparens) >= nums[i]
7310 && rx->offs[nums[i]].start != -1
7311 && rx->offs[nums[i]].end != -1)
7314 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7319 ret = newSVsv(&PL_sv_undef);
7322 av_push(retarray, ret);
7325 return newRV_noinc(MUTABLE_SV(retarray));
7332 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7335 struct regexp *const rx = ReANY(r);
7337 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7339 if (rx && RXp_PAREN_NAMES(rx)) {
7340 if (flags & RXapif_ALL) {
7341 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7343 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7345 SvREFCNT_dec_NN(sv);
7357 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7359 struct regexp *const rx = ReANY(r);
7361 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7363 if ( rx && RXp_PAREN_NAMES(rx) ) {
7364 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7366 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7373 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7375 struct regexp *const rx = ReANY(r);
7376 GET_RE_DEBUG_FLAGS_DECL;
7378 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7380 if (rx && RXp_PAREN_NAMES(rx)) {
7381 HV *hv = RXp_PAREN_NAMES(rx);
7383 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7386 SV* sv_dat = HeVAL(temphe);
7387 I32 *nums = (I32*)SvPVX(sv_dat);
7388 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7389 if ((I32)(rx->lastparen) >= nums[i] &&
7390 rx->offs[nums[i]].start != -1 &&
7391 rx->offs[nums[i]].end != -1)
7397 if (parno || flags & RXapif_ALL) {
7398 return newSVhek(HeKEY_hek(temphe));
7406 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7411 struct regexp *const rx = ReANY(r);
7413 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7415 if (rx && RXp_PAREN_NAMES(rx)) {
7416 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7417 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7418 } else if (flags & RXapif_ONE) {
7419 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7420 av = MUTABLE_AV(SvRV(ret));
7421 length = av_tindex(av);
7422 SvREFCNT_dec_NN(ret);
7423 return newSViv(length + 1);
7425 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7430 return &PL_sv_undef;
7434 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7436 struct regexp *const rx = ReANY(r);
7439 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7441 if (rx && RXp_PAREN_NAMES(rx)) {
7442 HV *hv= RXp_PAREN_NAMES(rx);
7444 (void)hv_iterinit(hv);
7445 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7448 SV* sv_dat = HeVAL(temphe);
7449 I32 *nums = (I32*)SvPVX(sv_dat);
7450 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7451 if ((I32)(rx->lastparen) >= nums[i] &&
7452 rx->offs[nums[i]].start != -1 &&
7453 rx->offs[nums[i]].end != -1)
7459 if (parno || flags & RXapif_ALL) {
7460 av_push(av, newSVhek(HeKEY_hek(temphe)));
7465 return newRV_noinc(MUTABLE_SV(av));
7469 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7472 struct regexp *const rx = ReANY(r);
7478 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7480 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7481 || n == RX_BUFF_IDX_CARET_FULLMATCH
7482 || n == RX_BUFF_IDX_CARET_POSTMATCH
7485 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7487 /* on something like
7490 * the KEEPCOPY is set on the PMOP rather than the regex */
7491 if (PL_curpm && r == PM_GETRE(PL_curpm))
7492 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7501 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7502 /* no need to distinguish between them any more */
7503 n = RX_BUFF_IDX_FULLMATCH;
7505 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7506 && rx->offs[0].start != -1)
7508 /* $`, ${^PREMATCH} */
7509 i = rx->offs[0].start;
7513 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7514 && rx->offs[0].end != -1)
7516 /* $', ${^POSTMATCH} */
7517 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7518 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7521 if ( 0 <= n && n <= (I32)rx->nparens &&
7522 (s1 = rx->offs[n].start) != -1 &&
7523 (t1 = rx->offs[n].end) != -1)
7525 /* $&, ${^MATCH}, $1 ... */
7527 s = rx->subbeg + s1 - rx->suboffset;
7532 assert(s >= rx->subbeg);
7533 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7535 #ifdef NO_TAINT_SUPPORT
7536 sv_setpvn(sv, s, i);
7538 const int oldtainted = TAINT_get;
7540 sv_setpvn(sv, s, i);
7541 TAINT_set(oldtainted);
7543 if ( (rx->intflags & PREGf_CANY_SEEN)
7544 ? (RXp_MATCH_UTF8(rx)
7545 && (!i || is_utf8_string((U8*)s, i)))
7546 : (RXp_MATCH_UTF8(rx)) )
7553 if (RXp_MATCH_TAINTED(rx)) {
7554 if (SvTYPE(sv) >= SVt_PVMG) {
7555 MAGIC* const mg = SvMAGIC(sv);
7558 SvMAGIC_set(sv, mg->mg_moremagic);
7560 if ((mgt = SvMAGIC(sv))) {
7561 mg->mg_moremagic = mgt;
7562 SvMAGIC_set(sv, mg);
7573 sv_setsv(sv,&PL_sv_undef);
7579 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7580 SV const * const value)
7582 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7584 PERL_UNUSED_ARG(rx);
7585 PERL_UNUSED_ARG(paren);
7586 PERL_UNUSED_ARG(value);
7589 Perl_croak_no_modify();
7593 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7596 struct regexp *const rx = ReANY(r);
7600 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7602 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7603 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7604 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7607 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7609 /* on something like
7612 * the KEEPCOPY is set on the PMOP rather than the regex */
7613 if (PL_curpm && r == PM_GETRE(PL_curpm))
7614 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7620 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7622 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7623 case RX_BUFF_IDX_PREMATCH: /* $` */
7624 if (rx->offs[0].start != -1) {
7625 i = rx->offs[0].start;
7634 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7635 case RX_BUFF_IDX_POSTMATCH: /* $' */
7636 if (rx->offs[0].end != -1) {
7637 i = rx->sublen - rx->offs[0].end;
7639 s1 = rx->offs[0].end;
7646 default: /* $& / ${^MATCH}, $1, $2, ... */
7647 if (paren <= (I32)rx->nparens &&
7648 (s1 = rx->offs[paren].start) != -1 &&
7649 (t1 = rx->offs[paren].end) != -1)
7655 if (ckWARN(WARN_UNINITIALIZED))
7656 report_uninit((const SV *)sv);
7661 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7662 const char * const s = rx->subbeg - rx->suboffset + s1;
7667 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7674 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7676 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7677 PERL_UNUSED_ARG(rx);
7681 return newSVpvs("Regexp");
7684 /* Scans the name of a named buffer from the pattern.
7685 * If flags is REG_RSN_RETURN_NULL returns null.
7686 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7687 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7688 * to the parsed name as looked up in the RExC_paren_names hash.
7689 * If there is an error throws a vFAIL().. type exception.
7692 #define REG_RSN_RETURN_NULL 0
7693 #define REG_RSN_RETURN_NAME 1
7694 #define REG_RSN_RETURN_DATA 2
7697 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7699 char *name_start = RExC_parse;
7701 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7703 assert (RExC_parse <= RExC_end);
7704 if (RExC_parse == RExC_end) NOOP;
7705 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7706 /* skip IDFIRST by using do...while */
7709 RExC_parse += UTF8SKIP(RExC_parse);
7710 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7714 } while (isWORDCHAR(*RExC_parse));
7716 RExC_parse++; /* so the <- from the vFAIL is after the offending
7718 vFAIL("Group name must start with a non-digit word character");
7722 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7723 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7724 if ( flags == REG_RSN_RETURN_NAME)
7726 else if (flags==REG_RSN_RETURN_DATA) {
7729 if ( ! sv_name ) /* should not happen*/
7730 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7731 if (RExC_paren_names)
7732 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7734 sv_dat = HeVAL(he_str);
7736 vFAIL("Reference to nonexistent named group");
7740 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7741 (unsigned long) flags);
7743 assert(0); /* NOT REACHED */
7748 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7749 int rem=(int)(RExC_end - RExC_parse); \
7758 if (RExC_lastparse!=RExC_parse) \
7759 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7762 iscut ? "..." : "<" \
7765 PerlIO_printf(Perl_debug_log,"%16s",""); \
7768 num = RExC_size + 1; \
7770 num=REG_NODE_NUM(RExC_emit); \
7771 if (RExC_lastnum!=num) \
7772 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7774 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7775 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7776 (int)((depth*2)), "", \
7780 RExC_lastparse=RExC_parse; \
7785 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7786 DEBUG_PARSE_MSG((funcname)); \
7787 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7789 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7790 DEBUG_PARSE_MSG((funcname)); \
7791 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7794 /* This section of code defines the inversion list object and its methods. The
7795 * interfaces are highly subject to change, so as much as possible is static to
7796 * this file. An inversion list is here implemented as a malloc'd C UV array
7797 * as an SVt_INVLIST scalar.
7799 * An inversion list for Unicode is an array of code points, sorted by ordinal
7800 * number. The zeroth element is the first code point in the list. The 1th
7801 * element is the first element beyond that not in the list. In other words,
7802 * the first range is
7803 * invlist[0]..(invlist[1]-1)
7804 * The other ranges follow. Thus every element whose index is divisible by two
7805 * marks the beginning of a range that is in the list, and every element not
7806 * divisible by two marks the beginning of a range not in the list. A single
7807 * element inversion list that contains the single code point N generally
7808 * consists of two elements
7811 * (The exception is when N is the highest representable value on the
7812 * machine, in which case the list containing just it would be a single
7813 * element, itself. By extension, if the last range in the list extends to
7814 * infinity, then the first element of that range will be in the inversion list
7815 * at a position that is divisible by two, and is the final element in the
7817 * Taking the complement (inverting) an inversion list is quite simple, if the
7818 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7819 * This implementation reserves an element at the beginning of each inversion
7820 * list to always contain 0; there is an additional flag in the header which
7821 * indicates if the list begins at the 0, or is offset to begin at the next
7824 * More about inversion lists can be found in "Unicode Demystified"
7825 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7826 * More will be coming when functionality is added later.
7828 * The inversion list data structure is currently implemented as an SV pointing
7829 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7830 * array of UV whose memory management is automatically handled by the existing
7831 * facilities for SV's.
7833 * Some of the methods should always be private to the implementation, and some
7834 * should eventually be made public */
7836 /* The header definitions are in F<inline_invlist.c> */
7838 PERL_STATIC_INLINE UV*
7839 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7841 /* Returns a pointer to the first element in the inversion list's array.
7842 * This is called upon initialization of an inversion list. Where the
7843 * array begins depends on whether the list has the code point U+0000 in it
7844 * or not. The other parameter tells it whether the code that follows this
7845 * call is about to put a 0 in the inversion list or not. The first
7846 * element is either the element reserved for 0, if TRUE, or the element
7847 * after it, if FALSE */
7849 bool* offset = get_invlist_offset_addr(invlist);
7850 UV* zero_addr = (UV *) SvPVX(invlist);
7852 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7855 assert(! _invlist_len(invlist));
7859 /* 1^1 = 0; 1^0 = 1 */
7860 *offset = 1 ^ will_have_0;
7861 return zero_addr + *offset;
7864 PERL_STATIC_INLINE UV*
7865 S_invlist_array(SV* const invlist)
7867 /* Returns the pointer to the inversion list's array. Every time the
7868 * length changes, this needs to be called in case malloc or realloc moved
7871 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7873 /* Must not be empty. If these fail, you probably didn't check for <len>
7874 * being non-zero before trying to get the array */
7875 assert(_invlist_len(invlist));
7877 /* The very first element always contains zero, The array begins either
7878 * there, or if the inversion list is offset, at the element after it.
7879 * The offset header field determines which; it contains 0 or 1 to indicate
7880 * how much additionally to add */
7881 assert(0 == *(SvPVX(invlist)));
7882 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7885 PERL_STATIC_INLINE void
7886 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7888 /* Sets the current number of elements stored in the inversion list.
7889 * Updates SvCUR correspondingly */
7890 PERL_UNUSED_CONTEXT;
7891 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7893 assert(SvTYPE(invlist) == SVt_INVLIST);
7898 : TO_INTERNAL_SIZE(len + offset));
7899 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7902 PERL_STATIC_INLINE IV*
7903 S_get_invlist_previous_index_addr(SV* invlist)
7905 /* Return the address of the IV that is reserved to hold the cached index
7907 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7909 assert(SvTYPE(invlist) == SVt_INVLIST);
7911 return &(((XINVLIST*) SvANY(invlist))->prev_index);
7914 PERL_STATIC_INLINE IV
7915 S_invlist_previous_index(SV* const invlist)
7917 /* Returns cached index of previous search */
7919 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7921 return *get_invlist_previous_index_addr(invlist);
7924 PERL_STATIC_INLINE void
7925 S_invlist_set_previous_index(SV* const invlist, const IV index)
7927 /* Caches <index> for later retrieval */
7929 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7931 assert(index == 0 || index < (int) _invlist_len(invlist));
7933 *get_invlist_previous_index_addr(invlist) = index;
7936 PERL_STATIC_INLINE UV
7937 S_invlist_max(SV* const invlist)
7939 /* Returns the maximum number of elements storable in the inversion list's
7940 * array, without having to realloc() */
7942 PERL_ARGS_ASSERT_INVLIST_MAX;
7944 assert(SvTYPE(invlist) == SVt_INVLIST);
7946 /* Assumes worst case, in which the 0 element is not counted in the
7947 * inversion list, so subtracts 1 for that */
7948 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7949 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7950 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7953 #ifndef PERL_IN_XSUB_RE
7955 Perl__new_invlist(pTHX_ IV initial_size)
7958 /* Return a pointer to a newly constructed inversion list, with enough
7959 * space to store 'initial_size' elements. If that number is negative, a
7960 * system default is used instead */
7964 if (initial_size < 0) {
7968 /* Allocate the initial space */
7969 new_list = newSV_type(SVt_INVLIST);
7971 /* First 1 is in case the zero element isn't in the list; second 1 is for
7973 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7974 invlist_set_len(new_list, 0, 0);
7976 /* Force iterinit() to be used to get iteration to work */
7977 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
7979 *get_invlist_previous_index_addr(new_list) = 0;
7985 Perl__new_invlist_C_array(pTHX_ const UV* const list)
7987 /* Return a pointer to a newly constructed inversion list, initialized to
7988 * point to <list>, which has to be in the exact correct inversion list
7989 * form, including internal fields. Thus this is a dangerous routine that
7990 * should not be used in the wrong hands. The passed in 'list' contains
7991 * several header fields at the beginning that are not part of the
7992 * inversion list body proper */
7994 const STRLEN length = (STRLEN) list[0];
7995 const UV version_id = list[1];
7996 const bool offset = cBOOL(list[2]);
7997 #define HEADER_LENGTH 3
7998 /* If any of the above changes in any way, you must change HEADER_LENGTH
7999 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8000 * perl -E 'say int(rand 2**31-1)'
8002 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8003 data structure type, so that one being
8004 passed in can be validated to be an
8005 inversion list of the correct vintage.
8008 SV* invlist = newSV_type(SVt_INVLIST);
8010 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8012 if (version_id != INVLIST_VERSION_ID) {
8013 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8016 /* The generated array passed in includes header elements that aren't part
8017 * of the list proper, so start it just after them */
8018 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8020 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8021 shouldn't touch it */
8023 *(get_invlist_offset_addr(invlist)) = offset;
8025 /* The 'length' passed to us is the physical number of elements in the
8026 * inversion list. But if there is an offset the logical number is one
8028 invlist_set_len(invlist, length - offset, offset);
8030 invlist_set_previous_index(invlist, 0);
8032 /* Initialize the iteration pointer. */
8033 invlist_iterfinish(invlist);
8035 SvREADONLY_on(invlist);
8039 #endif /* ifndef PERL_IN_XSUB_RE */
8042 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8044 /* Grow the maximum size of an inversion list */
8046 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8048 assert(SvTYPE(invlist) == SVt_INVLIST);
8050 /* Add one to account for the zero element at the beginning which may not
8051 * be counted by the calling parameters */
8052 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8055 PERL_STATIC_INLINE void
8056 S_invlist_trim(SV* const invlist)
8058 PERL_ARGS_ASSERT_INVLIST_TRIM;
8060 assert(SvTYPE(invlist) == SVt_INVLIST);
8062 /* Change the length of the inversion list to how many entries it currently
8064 SvPV_shrink_to_cur((SV *) invlist);
8068 S__append_range_to_invlist(pTHX_ SV* const invlist,
8069 const UV start, const UV end)
8071 /* Subject to change or removal. Append the range from 'start' to 'end' at
8072 * the end of the inversion list. The range must be above any existing
8076 UV max = invlist_max(invlist);
8077 UV len = _invlist_len(invlist);
8080 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8082 if (len == 0) { /* Empty lists must be initialized */
8083 offset = start != 0;
8084 array = _invlist_array_init(invlist, ! offset);
8087 /* Here, the existing list is non-empty. The current max entry in the
8088 * list is generally the first value not in the set, except when the
8089 * set extends to the end of permissible values, in which case it is
8090 * the first entry in that final set, and so this call is an attempt to
8091 * append out-of-order */
8093 UV final_element = len - 1;
8094 array = invlist_array(invlist);
8095 if (array[final_element] > start
8096 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8098 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8099 array[final_element], start,
8100 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8103 /* Here, it is a legal append. If the new range begins with the first
8104 * value not in the set, it is extending the set, so the new first
8105 * value not in the set is one greater than the newly extended range.
8107 offset = *get_invlist_offset_addr(invlist);
8108 if (array[final_element] == start) {
8109 if (end != UV_MAX) {
8110 array[final_element] = end + 1;
8113 /* But if the end is the maximum representable on the machine,
8114 * just let the range that this would extend to have no end */
8115 invlist_set_len(invlist, len - 1, offset);
8121 /* Here the new range doesn't extend any existing set. Add it */
8123 len += 2; /* Includes an element each for the start and end of range */
8125 /* If wll overflow the existing space, extend, which may cause the array to
8128 invlist_extend(invlist, len);
8130 /* Have to set len here to avoid assert failure in invlist_array() */
8131 invlist_set_len(invlist, len, offset);
8133 array = invlist_array(invlist);
8136 invlist_set_len(invlist, len, offset);
8139 /* The next item on the list starts the range, the one after that is
8140 * one past the new range. */
8141 array[len - 2] = start;
8142 if (end != UV_MAX) {
8143 array[len - 1] = end + 1;
8146 /* But if the end is the maximum representable on the machine, just let
8147 * the range have no end */
8148 invlist_set_len(invlist, len - 1, offset);
8152 #ifndef PERL_IN_XSUB_RE
8155 Perl__invlist_search(SV* const invlist, const UV cp)
8157 /* Searches the inversion list for the entry that contains the input code
8158 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8159 * return value is the index into the list's array of the range that
8164 IV high = _invlist_len(invlist);
8165 const IV highest_element = high - 1;
8168 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8170 /* If list is empty, return failure. */
8175 /* (We can't get the array unless we know the list is non-empty) */
8176 array = invlist_array(invlist);
8178 mid = invlist_previous_index(invlist);
8179 assert(mid >=0 && mid <= highest_element);
8181 /* <mid> contains the cache of the result of the previous call to this
8182 * function (0 the first time). See if this call is for the same result,
8183 * or if it is for mid-1. This is under the theory that calls to this
8184 * function will often be for related code points that are near each other.
8185 * And benchmarks show that caching gives better results. We also test
8186 * here if the code point is within the bounds of the list. These tests
8187 * replace others that would have had to be made anyway to make sure that
8188 * the array bounds were not exceeded, and these give us extra information
8189 * at the same time */
8190 if (cp >= array[mid]) {
8191 if (cp >= array[highest_element]) {
8192 return highest_element;
8195 /* Here, array[mid] <= cp < array[highest_element]. This means that
8196 * the final element is not the answer, so can exclude it; it also
8197 * means that <mid> is not the final element, so can refer to 'mid + 1'
8199 if (cp < array[mid + 1]) {
8205 else { /* cp < aray[mid] */
8206 if (cp < array[0]) { /* Fail if outside the array */
8210 if (cp >= array[mid - 1]) {
8215 /* Binary search. What we are looking for is <i> such that
8216 * array[i] <= cp < array[i+1]
8217 * The loop below converges on the i+1. Note that there may not be an
8218 * (i+1)th element in the array, and things work nonetheless */
8219 while (low < high) {
8220 mid = (low + high) / 2;
8221 assert(mid <= highest_element);
8222 if (array[mid] <= cp) { /* cp >= array[mid] */
8225 /* We could do this extra test to exit the loop early.
8226 if (cp < array[low]) {
8231 else { /* cp < array[mid] */
8238 invlist_set_previous_index(invlist, high);
8243 Perl__invlist_populate_swatch(SV* const invlist,
8244 const UV start, const UV end, U8* swatch)
8246 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8247 * but is used when the swash has an inversion list. This makes this much
8248 * faster, as it uses a binary search instead of a linear one. This is
8249 * intimately tied to that function, and perhaps should be in utf8.c,
8250 * except it is intimately tied to inversion lists as well. It assumes
8251 * that <swatch> is all 0's on input */
8254 const IV len = _invlist_len(invlist);
8258 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8260 if (len == 0) { /* Empty inversion list */
8264 array = invlist_array(invlist);
8266 /* Find which element it is */
8267 i = _invlist_search(invlist, start);
8269 /* We populate from <start> to <end> */
8270 while (current < end) {
8273 /* The inversion list gives the results for every possible code point
8274 * after the first one in the list. Only those ranges whose index is
8275 * even are ones that the inversion list matches. For the odd ones,
8276 * and if the initial code point is not in the list, we have to skip
8277 * forward to the next element */
8278 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8280 if (i >= len) { /* Finished if beyond the end of the array */
8284 if (current >= end) { /* Finished if beyond the end of what we
8286 if (LIKELY(end < UV_MAX)) {
8290 /* We get here when the upper bound is the maximum
8291 * representable on the machine, and we are looking for just
8292 * that code point. Have to special case it */
8294 goto join_end_of_list;
8297 assert(current >= start);
8299 /* The current range ends one below the next one, except don't go past
8302 upper = (i < len && array[i] < end) ? array[i] : end;
8304 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8305 * for each code point in it */
8306 for (; current < upper; current++) {
8307 const STRLEN offset = (STRLEN)(current - start);
8308 swatch[offset >> 3] |= 1 << (offset & 7);
8313 /* Quit if at the end of the list */
8316 /* But first, have to deal with the highest possible code point on
8317 * the platform. The previous code assumes that <end> is one
8318 * beyond where we want to populate, but that is impossible at the
8319 * platform's infinity, so have to handle it specially */
8320 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8322 const STRLEN offset = (STRLEN)(end - start);
8323 swatch[offset >> 3] |= 1 << (offset & 7);
8328 /* Advance to the next range, which will be for code points not in the
8337 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8338 const bool complement_b, SV** output)
8340 /* Take the union of two inversion lists and point <output> to it. *output
8341 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8342 * the reference count to that list will be decremented if not already a
8343 * temporary (mortal); otherwise *output will be made correspondingly
8344 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8345 * second list is returned. If <complement_b> is TRUE, the union is taken
8346 * of the complement (inversion) of <b> instead of b itself.
8348 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8349 * Richard Gillam, published by Addison-Wesley, and explained at some
8350 * length there. The preface says to incorporate its examples into your
8351 * code at your own risk.
8353 * The algorithm is like a merge sort.
8355 * XXX A potential performance improvement is to keep track as we go along
8356 * if only one of the inputs contributes to the result, meaning the other
8357 * is a subset of that one. In that case, we can skip the final copy and
8358 * return the larger of the input lists, but then outside code might need
8359 * to keep track of whether to free the input list or not */
8361 const UV* array_a; /* a's array */
8363 UV len_a; /* length of a's array */
8366 SV* u; /* the resulting union */
8370 UV i_a = 0; /* current index into a's array */
8374 /* running count, as explained in the algorithm source book; items are
8375 * stopped accumulating and are output when the count changes to/from 0.
8376 * The count is incremented when we start a range that's in the set, and
8377 * decremented when we start a range that's not in the set. So its range
8378 * is 0 to 2. Only when the count is zero is something not in the set.
8382 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8385 /* If either one is empty, the union is the other one */
8386 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8387 bool make_temp = FALSE; /* Should we mortalize the result? */
8391 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8397 *output = invlist_clone(b);
8399 _invlist_invert(*output);
8401 } /* else *output already = b; */
8404 sv_2mortal(*output);
8408 else if ((len_b = _invlist_len(b)) == 0) {
8409 bool make_temp = FALSE;
8411 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8416 /* The complement of an empty list is a list that has everything in it,
8417 * so the union with <a> includes everything too */
8420 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8424 *output = _new_invlist(1);
8425 _append_range_to_invlist(*output, 0, UV_MAX);
8427 else if (*output != a) {
8428 *output = invlist_clone(a);
8430 /* else *output already = a; */
8433 sv_2mortal(*output);
8438 /* Here both lists exist and are non-empty */
8439 array_a = invlist_array(a);
8440 array_b = invlist_array(b);
8442 /* If are to take the union of 'a' with the complement of b, set it
8443 * up so are looking at b's complement. */
8446 /* To complement, we invert: if the first element is 0, remove it. To
8447 * do this, we just pretend the array starts one later */
8448 if (array_b[0] == 0) {
8454 /* But if the first element is not zero, we pretend the list starts
8455 * at the 0 that is always stored immediately before the array. */
8461 /* Size the union for the worst case: that the sets are completely
8463 u = _new_invlist(len_a + len_b);
8465 /* Will contain U+0000 if either component does */
8466 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8467 || (len_b > 0 && array_b[0] == 0));
8469 /* Go through each list item by item, stopping when exhausted one of
8471 while (i_a < len_a && i_b < len_b) {
8472 UV cp; /* The element to potentially add to the union's array */
8473 bool cp_in_set; /* is it in the the input list's set or not */
8475 /* We need to take one or the other of the two inputs for the union.
8476 * Since we are merging two sorted lists, we take the smaller of the
8477 * next items. In case of a tie, we take the one that is in its set
8478 * first. If we took one not in the set first, it would decrement the
8479 * count, possibly to 0 which would cause it to be output as ending the
8480 * range, and the next time through we would take the same number, and
8481 * output it again as beginning the next range. By doing it the
8482 * opposite way, there is no possibility that the count will be
8483 * momentarily decremented to 0, and thus the two adjoining ranges will
8484 * be seamlessly merged. (In a tie and both are in the set or both not
8485 * in the set, it doesn't matter which we take first.) */
8486 if (array_a[i_a] < array_b[i_b]
8487 || (array_a[i_a] == array_b[i_b]
8488 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8490 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8494 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8495 cp = array_b[i_b++];
8498 /* Here, have chosen which of the two inputs to look at. Only output
8499 * if the running count changes to/from 0, which marks the
8500 * beginning/end of a range in that's in the set */
8503 array_u[i_u++] = cp;
8510 array_u[i_u++] = cp;
8515 /* Here, we are finished going through at least one of the lists, which
8516 * means there is something remaining in at most one. We check if the list
8517 * that hasn't been exhausted is positioned such that we are in the middle
8518 * of a range in its set or not. (i_a and i_b point to the element beyond
8519 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8520 * is potentially more to output.
8521 * There are four cases:
8522 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8523 * in the union is entirely from the non-exhausted set.
8524 * 2) Both were in their sets, count is 2. Nothing further should
8525 * be output, as everything that remains will be in the exhausted
8526 * list's set, hence in the union; decrementing to 1 but not 0 insures
8528 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8529 * Nothing further should be output because the union includes
8530 * everything from the exhausted set. Not decrementing ensures that.
8531 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8532 * decrementing to 0 insures that we look at the remainder of the
8533 * non-exhausted set */
8534 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8535 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8540 /* The final length is what we've output so far, plus what else is about to
8541 * be output. (If 'count' is non-zero, then the input list we exhausted
8542 * has everything remaining up to the machine's limit in its set, and hence
8543 * in the union, so there will be no further output. */
8546 /* At most one of the subexpressions will be non-zero */
8547 len_u += (len_a - i_a) + (len_b - i_b);
8550 /* Set result to final length, which can change the pointer to array_u, so
8552 if (len_u != _invlist_len(u)) {
8553 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8555 array_u = invlist_array(u);
8558 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8559 * the other) ended with everything above it not in its set. That means
8560 * that the remaining part of the union is precisely the same as the
8561 * non-exhausted list, so can just copy it unchanged. (If both list were
8562 * exhausted at the same time, then the operations below will be both 0.)
8565 IV copy_count; /* At most one will have a non-zero copy count */
8566 if ((copy_count = len_a - i_a) > 0) {
8567 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8569 else if ((copy_count = len_b - i_b) > 0) {
8570 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8574 /* We may be removing a reference to one of the inputs. If so, the output
8575 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8576 * count decremented) */
8577 if (a == *output || b == *output) {
8578 assert(! invlist_is_iterating(*output));
8579 if ((SvTEMP(*output))) {
8583 SvREFCNT_dec_NN(*output);
8593 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8594 const bool complement_b, SV** i)
8596 /* Take the intersection of two inversion lists and point <i> to it. *i
8597 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8598 * the reference count to that list will be decremented if not already a
8599 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8600 * The first list, <a>, may be NULL, in which case an empty list is
8601 * returned. If <complement_b> is TRUE, the result will be the
8602 * intersection of <a> and the complement (or inversion) of <b> instead of
8605 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8606 * Richard Gillam, published by Addison-Wesley, and explained at some
8607 * length there. The preface says to incorporate its examples into your
8608 * code at your own risk. In fact, it had bugs
8610 * The algorithm is like a merge sort, and is essentially the same as the
8614 const UV* array_a; /* a's array */
8616 UV len_a; /* length of a's array */
8619 SV* r; /* the resulting intersection */
8623 UV i_a = 0; /* current index into a's array */
8627 /* running count, as explained in the algorithm source book; items are
8628 * stopped accumulating and are output when the count changes to/from 2.
8629 * The count is incremented when we start a range that's in the set, and
8630 * decremented when we start a range that's not in the set. So its range
8631 * is 0 to 2. Only when the count is 2 is something in the intersection.
8635 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8638 /* Special case if either one is empty */
8639 len_a = (a == NULL) ? 0 : _invlist_len(a);
8640 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8641 bool make_temp = FALSE;
8643 if (len_a != 0 && complement_b) {
8645 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8646 * be empty. Here, also we are using 'b's complement, which hence
8647 * must be every possible code point. Thus the intersection is
8651 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8656 *i = invlist_clone(a);
8658 /* else *i is already 'a' */
8666 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8667 * intersection must be empty */
8669 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8674 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8678 *i = _new_invlist(0);
8686 /* Here both lists exist and are non-empty */
8687 array_a = invlist_array(a);
8688 array_b = invlist_array(b);
8690 /* If are to take the intersection of 'a' with the complement of b, set it
8691 * up so are looking at b's complement. */
8694 /* To complement, we invert: if the first element is 0, remove it. To
8695 * do this, we just pretend the array starts one later */
8696 if (array_b[0] == 0) {
8702 /* But if the first element is not zero, we pretend the list starts
8703 * at the 0 that is always stored immediately before the array. */
8709 /* Size the intersection for the worst case: that the intersection ends up
8710 * fragmenting everything to be completely disjoint */
8711 r= _new_invlist(len_a + len_b);
8713 /* Will contain U+0000 iff both components do */
8714 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8715 && len_b > 0 && array_b[0] == 0);
8717 /* Go through each list item by item, stopping when exhausted one of
8719 while (i_a < len_a && i_b < len_b) {
8720 UV cp; /* The element to potentially add to the intersection's
8722 bool cp_in_set; /* Is it in the input list's set or not */
8724 /* We need to take one or the other of the two inputs for the
8725 * intersection. Since we are merging two sorted lists, we take the
8726 * smaller of the next items. In case of a tie, we take the one that
8727 * is not in its set first (a difference from the union algorithm). If
8728 * we took one in the set first, it would increment the count, possibly
8729 * to 2 which would cause it to be output as starting a range in the
8730 * intersection, and the next time through we would take that same
8731 * number, and output it again as ending the set. By doing it the
8732 * opposite of this, there is no possibility that the count will be
8733 * momentarily incremented to 2. (In a tie and both are in the set or
8734 * both not in the set, it doesn't matter which we take first.) */
8735 if (array_a[i_a] < array_b[i_b]
8736 || (array_a[i_a] == array_b[i_b]
8737 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8739 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8743 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8747 /* Here, have chosen which of the two inputs to look at. Only output
8748 * if the running count changes to/from 2, which marks the
8749 * beginning/end of a range that's in the intersection */
8753 array_r[i_r++] = cp;
8758 array_r[i_r++] = cp;
8764 /* Here, we are finished going through at least one of the lists, which
8765 * means there is something remaining in at most one. We check if the list
8766 * that has been exhausted is positioned such that we are in the middle
8767 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8768 * the ones we care about.) There are four cases:
8769 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8770 * nothing left in the intersection.
8771 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8772 * above 2. What should be output is exactly that which is in the
8773 * non-exhausted set, as everything it has is also in the intersection
8774 * set, and everything it doesn't have can't be in the intersection
8775 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8776 * gets incremented to 2. Like the previous case, the intersection is
8777 * everything that remains in the non-exhausted set.
8778 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8779 * remains 1. And the intersection has nothing more. */
8780 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8781 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8786 /* The final length is what we've output so far plus what else is in the
8787 * intersection. At most one of the subexpressions below will be non-zero
8791 len_r += (len_a - i_a) + (len_b - i_b);
8794 /* Set result to final length, which can change the pointer to array_r, so
8796 if (len_r != _invlist_len(r)) {
8797 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8799 array_r = invlist_array(r);
8802 /* Finish outputting any remaining */
8803 if (count >= 2) { /* At most one will have a non-zero copy count */
8805 if ((copy_count = len_a - i_a) > 0) {
8806 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8808 else if ((copy_count = len_b - i_b) > 0) {
8809 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8813 /* We may be removing a reference to one of the inputs. If so, the output
8814 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8815 * count decremented) */
8816 if (a == *i || b == *i) {
8817 assert(! invlist_is_iterating(*i));
8822 SvREFCNT_dec_NN(*i);
8832 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8834 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8835 * set. A pointer to the inversion list is returned. This may actually be
8836 * a new list, in which case the passed in one has been destroyed. The
8837 * passed in inversion list can be NULL, in which case a new one is created
8838 * with just the one range in it */
8843 if (invlist == NULL) {
8844 invlist = _new_invlist(2);
8848 len = _invlist_len(invlist);
8851 /* If comes after the final entry actually in the list, can just append it
8854 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8855 && start >= invlist_array(invlist)[len - 1]))
8857 _append_range_to_invlist(invlist, start, end);
8861 /* Here, can't just append things, create and return a new inversion list
8862 * which is the union of this range and the existing inversion list */
8863 range_invlist = _new_invlist(2);
8864 _append_range_to_invlist(range_invlist, start, end);
8866 _invlist_union(invlist, range_invlist, &invlist);
8868 /* The temporary can be freed */
8869 SvREFCNT_dec_NN(range_invlist);
8875 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8876 UV** other_elements_ptr)
8878 /* Create and return an inversion list whose contents are to be populated
8879 * by the caller. The caller gives the number of elements (in 'size') and
8880 * the very first element ('element0'). This function will set
8881 * '*other_elements_ptr' to an array of UVs, where the remaining elements
8884 * Obviously there is some trust involved that the caller will properly
8885 * fill in the other elements of the array.
8887 * (The first element needs to be passed in, as the underlying code does
8888 * things differently depending on whether it is zero or non-zero) */
8890 SV* invlist = _new_invlist(size);
8893 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8895 _append_range_to_invlist(invlist, element0, element0);
8896 offset = *get_invlist_offset_addr(invlist);
8898 invlist_set_len(invlist, size, offset);
8899 *other_elements_ptr = invlist_array(invlist) + 1;
8905 PERL_STATIC_INLINE SV*
8906 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8907 return _add_range_to_invlist(invlist, cp, cp);
8910 #ifndef PERL_IN_XSUB_RE
8912 Perl__invlist_invert(pTHX_ SV* const invlist)
8914 /* Complement the input inversion list. This adds a 0 if the list didn't
8915 * have a zero; removes it otherwise. As described above, the data
8916 * structure is set up so that this is very efficient */
8918 PERL_ARGS_ASSERT__INVLIST_INVERT;
8920 assert(! invlist_is_iterating(invlist));
8922 /* The inverse of matching nothing is matching everything */
8923 if (_invlist_len(invlist) == 0) {
8924 _append_range_to_invlist(invlist, 0, UV_MAX);
8928 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8933 PERL_STATIC_INLINE SV*
8934 S_invlist_clone(pTHX_ SV* const invlist)
8937 /* Return a new inversion list that is a copy of the input one, which is
8938 * unchanged. The new list will not be mortal even if the old one was. */
8940 /* Need to allocate extra space to accommodate Perl's addition of a
8941 * trailing NUL to SvPV's, since it thinks they are always strings */
8942 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8943 STRLEN physical_length = SvCUR(invlist);
8944 bool offset = *(get_invlist_offset_addr(invlist));
8946 PERL_ARGS_ASSERT_INVLIST_CLONE;
8948 *(get_invlist_offset_addr(new_invlist)) = offset;
8949 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8950 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8955 PERL_STATIC_INLINE STRLEN*
8956 S_get_invlist_iter_addr(SV* invlist)
8958 /* Return the address of the UV that contains the current iteration
8961 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8963 assert(SvTYPE(invlist) == SVt_INVLIST);
8965 return &(((XINVLIST*) SvANY(invlist))->iterator);
8968 PERL_STATIC_INLINE void
8969 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
8971 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8973 *get_invlist_iter_addr(invlist) = 0;
8976 PERL_STATIC_INLINE void
8977 S_invlist_iterfinish(SV* invlist)
8979 /* Terminate iterator for invlist. This is to catch development errors.
8980 * Any iteration that is interrupted before completed should call this
8981 * function. Functions that add code points anywhere else but to the end
8982 * of an inversion list assert that they are not in the middle of an
8983 * iteration. If they were, the addition would make the iteration
8984 * problematical: if the iteration hadn't reached the place where things
8985 * were being added, it would be ok */
8987 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
8989 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
8993 S_invlist_iternext(SV* invlist, UV* start, UV* end)
8995 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
8996 * This call sets in <*start> and <*end>, the next range in <invlist>.
8997 * Returns <TRUE> if successful and the next call will return the next
8998 * range; <FALSE> if was already at the end of the list. If the latter,
8999 * <*start> and <*end> are unchanged, and the next call to this function
9000 * will start over at the beginning of the list */
9002 STRLEN* pos = get_invlist_iter_addr(invlist);
9003 UV len = _invlist_len(invlist);
9006 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9009 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9013 array = invlist_array(invlist);
9015 *start = array[(*pos)++];
9021 *end = array[(*pos)++] - 1;
9027 PERL_STATIC_INLINE bool
9028 S_invlist_is_iterating(SV* const invlist)
9030 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9032 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9035 PERL_STATIC_INLINE UV
9036 S_invlist_highest(SV* const invlist)
9038 /* Returns the highest code point that matches an inversion list. This API
9039 * has an ambiguity, as it returns 0 under either the highest is actually
9040 * 0, or if the list is empty. If this distinction matters to you, check
9041 * for emptiness before calling this function */
9043 UV len = _invlist_len(invlist);
9046 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9052 array = invlist_array(invlist);
9054 /* The last element in the array in the inversion list always starts a
9055 * range that goes to infinity. That range may be for code points that are
9056 * matched in the inversion list, or it may be for ones that aren't
9057 * matched. In the latter case, the highest code point in the set is one
9058 * less than the beginning of this range; otherwise it is the final element
9059 * of this range: infinity */
9060 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9062 : array[len - 1] - 1;
9065 #ifndef PERL_IN_XSUB_RE
9067 Perl__invlist_contents(pTHX_ SV* const invlist)
9069 /* Get the contents of an inversion list into a string SV so that they can
9070 * be printed out. It uses the format traditionally done for debug tracing
9074 SV* output = newSVpvs("\n");
9076 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9078 assert(! invlist_is_iterating(invlist));
9080 invlist_iterinit(invlist);
9081 while (invlist_iternext(invlist, &start, &end)) {
9082 if (end == UV_MAX) {
9083 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9085 else if (end != start) {
9086 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9090 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9098 #ifndef PERL_IN_XSUB_RE
9100 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9101 const char * const indent, SV* const invlist)
9103 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9104 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9105 * the string 'indent'. The output looks like this:
9106 [0] 0x000A .. 0x000D
9108 [4] 0x2028 .. 0x2029
9109 [6] 0x3104 .. INFINITY
9110 * This means that the first range of code points matched by the list are
9111 * 0xA through 0xD; the second range contains only the single code point
9112 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9113 * are used to define each range (except if the final range extends to
9114 * infinity, only a single element is needed). The array index of the
9115 * first element for the corresponding range is given in brackets. */
9120 PERL_ARGS_ASSERT__INVLIST_DUMP;
9122 if (invlist_is_iterating(invlist)) {
9123 Perl_dump_indent(aTHX_ level, file,
9124 "%sCan't dump inversion list because is in middle of iterating\n",
9129 invlist_iterinit(invlist);
9130 while (invlist_iternext(invlist, &start, &end)) {
9131 if (end == UV_MAX) {
9132 Perl_dump_indent(aTHX_ level, file,
9133 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9134 indent, (UV)count, start);
9136 else if (end != start) {
9137 Perl_dump_indent(aTHX_ level, file,
9138 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9139 indent, (UV)count, start, end);
9142 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9143 indent, (UV)count, start);
9150 Perl__load_PL_utf8_foldclosures (pTHX)
9152 assert(! PL_utf8_foldclosures);
9154 /* If the folds haven't been read in, call a fold function
9156 if (! PL_utf8_tofold) {
9157 U8 dummy[UTF8_MAXBYTES_CASE+1];
9159 /* This string is just a short named one above \xff */
9160 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9161 assert(PL_utf8_tofold); /* Verify that worked */
9163 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9167 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9169 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9171 /* Return a boolean as to if the two passed in inversion lists are
9172 * identical. The final argument, if TRUE, says to take the complement of
9173 * the second inversion list before doing the comparison */
9175 const UV* array_a = invlist_array(a);
9176 const UV* array_b = invlist_array(b);
9177 UV len_a = _invlist_len(a);
9178 UV len_b = _invlist_len(b);
9180 UV i = 0; /* current index into the arrays */
9181 bool retval = TRUE; /* Assume are identical until proven otherwise */
9183 PERL_ARGS_ASSERT__INVLISTEQ;
9185 /* If are to compare 'a' with the complement of b, set it
9186 * up so are looking at b's complement. */
9189 /* The complement of nothing is everything, so <a> would have to have
9190 * just one element, starting at zero (ending at infinity) */
9192 return (len_a == 1 && array_a[0] == 0);
9194 else if (array_b[0] == 0) {
9196 /* Otherwise, to complement, we invert. Here, the first element is
9197 * 0, just remove it. To do this, we just pretend the array starts
9205 /* But if the first element is not zero, we pretend the list starts
9206 * at the 0 that is always stored immediately before the array. */
9212 /* Make sure that the lengths are the same, as well as the final element
9213 * before looping through the remainder. (Thus we test the length, final,
9214 * and first elements right off the bat) */
9215 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9218 else for (i = 0; i < len_a - 1; i++) {
9219 if (array_a[i] != array_b[i]) {
9229 #undef HEADER_LENGTH
9230 #undef TO_INTERNAL_SIZE
9231 #undef FROM_INTERNAL_SIZE
9232 #undef INVLIST_VERSION_ID
9234 /* End of inversion list object */
9237 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9239 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9240 * constructs, and updates RExC_flags with them. On input, RExC_parse
9241 * should point to the first flag; it is updated on output to point to the
9242 * final ')' or ':'. There needs to be at least one flag, or this will
9245 /* for (?g), (?gc), and (?o) warnings; warning
9246 about (?c) will warn about (?g) -- japhy */
9248 #define WASTED_O 0x01
9249 #define WASTED_G 0x02
9250 #define WASTED_C 0x04
9251 #define WASTED_GC (WASTED_G|WASTED_C)
9252 I32 wastedflags = 0x00;
9253 U32 posflags = 0, negflags = 0;
9254 U32 *flagsp = &posflags;
9255 char has_charset_modifier = '\0';
9257 bool has_use_defaults = FALSE;
9258 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9260 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9262 /* '^' as an initial flag sets certain defaults */
9263 if (UCHARAT(RExC_parse) == '^') {
9265 has_use_defaults = TRUE;
9266 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9267 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9268 ? REGEX_UNICODE_CHARSET
9269 : REGEX_DEPENDS_CHARSET);
9272 cs = get_regex_charset(RExC_flags);
9273 if (cs == REGEX_DEPENDS_CHARSET
9274 && (RExC_utf8 || RExC_uni_semantics))
9276 cs = REGEX_UNICODE_CHARSET;
9279 while (*RExC_parse) {
9280 /* && strchr("iogcmsx", *RExC_parse) */
9281 /* (?g), (?gc) and (?o) are useless here
9282 and must be globally applied -- japhy */
9283 switch (*RExC_parse) {
9285 /* Code for the imsx flags */
9286 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9288 case LOCALE_PAT_MOD:
9289 if (has_charset_modifier) {
9290 goto excess_modifier;
9292 else if (flagsp == &negflags) {
9295 cs = REGEX_LOCALE_CHARSET;
9296 has_charset_modifier = LOCALE_PAT_MOD;
9298 case UNICODE_PAT_MOD:
9299 if (has_charset_modifier) {
9300 goto excess_modifier;
9302 else if (flagsp == &negflags) {
9305 cs = REGEX_UNICODE_CHARSET;
9306 has_charset_modifier = UNICODE_PAT_MOD;
9308 case ASCII_RESTRICT_PAT_MOD:
9309 if (flagsp == &negflags) {
9312 if (has_charset_modifier) {
9313 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9314 goto excess_modifier;
9316 /* Doubled modifier implies more restricted */
9317 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9320 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9322 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9324 case DEPENDS_PAT_MOD:
9325 if (has_use_defaults) {
9326 goto fail_modifiers;
9328 else if (flagsp == &negflags) {
9331 else if (has_charset_modifier) {
9332 goto excess_modifier;
9335 /* The dual charset means unicode semantics if the
9336 * pattern (or target, not known until runtime) are
9337 * utf8, or something in the pattern indicates unicode
9339 cs = (RExC_utf8 || RExC_uni_semantics)
9340 ? REGEX_UNICODE_CHARSET
9341 : REGEX_DEPENDS_CHARSET;
9342 has_charset_modifier = DEPENDS_PAT_MOD;
9346 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9347 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9349 else if (has_charset_modifier == *(RExC_parse - 1)) {
9350 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9354 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9359 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9362 case ONCE_PAT_MOD: /* 'o' */
9363 case GLOBAL_PAT_MOD: /* 'g' */
9364 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9365 const I32 wflagbit = *RExC_parse == 'o'
9368 if (! (wastedflags & wflagbit) ) {
9369 wastedflags |= wflagbit;
9370 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9373 "Useless (%s%c) - %suse /%c modifier",
9374 flagsp == &negflags ? "?-" : "?",
9376 flagsp == &negflags ? "don't " : "",
9383 case CONTINUE_PAT_MOD: /* 'c' */
9384 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
9385 if (! (wastedflags & WASTED_C) ) {
9386 wastedflags |= WASTED_GC;
9387 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9390 "Useless (%sc) - %suse /gc modifier",
9391 flagsp == &negflags ? "?-" : "?",
9392 flagsp == &negflags ? "don't " : ""
9397 case KEEPCOPY_PAT_MOD: /* 'p' */
9398 if (flagsp == &negflags) {
9400 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9402 *flagsp |= RXf_PMf_KEEPCOPY;
9406 /* A flag is a default iff it is following a minus, so
9407 * if there is a minus, it means will be trying to
9408 * re-specify a default which is an error */
9409 if (has_use_defaults || flagsp == &negflags) {
9410 goto fail_modifiers;
9413 wastedflags = 0; /* reset so (?g-c) warns twice */
9417 RExC_flags |= posflags;
9418 RExC_flags &= ~negflags;
9419 set_regex_charset(&RExC_flags, cs);
9420 if (RExC_flags & RXf_PMf_FOLD) {
9421 RExC_contains_i = 1;
9427 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9428 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9429 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9430 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9439 - reg - regular expression, i.e. main body or parenthesized thing
9441 * Caller must absorb opening parenthesis.
9443 * Combining parenthesis handling with the base level of regular expression
9444 * is a trifle forced, but the need to tie the tails of the branches to what
9445 * follows makes it hard to avoid.
9447 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9449 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9451 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9454 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9455 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9456 needs to be restarted.
9457 Otherwise would only return NULL if regbranch() returns NULL, which
9460 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9461 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9462 * 2 is like 1, but indicates that nextchar() has been called to advance
9463 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9464 * this flag alerts us to the need to check for that */
9467 regnode *ret; /* Will be the head of the group. */
9470 regnode *ender = NULL;
9473 U32 oregflags = RExC_flags;
9474 bool have_branch = 0;
9476 I32 freeze_paren = 0;
9477 I32 after_freeze = 0;
9478 I32 num; /* numeric backreferences */
9480 char * parse_start = RExC_parse; /* MJD */
9481 char * const oregcomp_parse = RExC_parse;
9483 GET_RE_DEBUG_FLAGS_DECL;
9485 PERL_ARGS_ASSERT_REG;
9486 DEBUG_PARSE("reg ");
9488 *flagp = 0; /* Tentatively. */
9491 /* Make an OPEN node, if parenthesized. */
9494 /* Under /x, space and comments can be gobbled up between the '(' and
9495 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9496 * intervening space, as the sequence is a token, and a token should be
9498 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9500 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9501 char *start_verb = RExC_parse;
9502 STRLEN verb_len = 0;
9503 char *start_arg = NULL;
9504 unsigned char op = 0;
9506 int internal_argval = 0; /* internal_argval is only useful if
9509 if (has_intervening_patws) {
9511 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9513 while ( *RExC_parse && *RExC_parse != ')' ) {
9514 if ( *RExC_parse == ':' ) {
9515 start_arg = RExC_parse + 1;
9521 verb_len = RExC_parse - start_verb;
9524 while ( *RExC_parse && *RExC_parse != ')' )
9526 if ( *RExC_parse != ')' )
9527 vFAIL("Unterminated verb pattern argument");
9528 if ( RExC_parse == start_arg )
9531 if ( *RExC_parse != ')' )
9532 vFAIL("Unterminated verb pattern");
9535 switch ( *start_verb ) {
9536 case 'A': /* (*ACCEPT) */
9537 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9539 internal_argval = RExC_nestroot;
9542 case 'C': /* (*COMMIT) */
9543 if ( memEQs(start_verb,verb_len,"COMMIT") )
9546 case 'F': /* (*FAIL) */
9547 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9552 case ':': /* (*:NAME) */
9553 case 'M': /* (*MARK:NAME) */
9554 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9559 case 'P': /* (*PRUNE) */
9560 if ( memEQs(start_verb,verb_len,"PRUNE") )
9563 case 'S': /* (*SKIP) */
9564 if ( memEQs(start_verb,verb_len,"SKIP") )
9567 case 'T': /* (*THEN) */
9568 /* [19:06] <TimToady> :: is then */
9569 if ( memEQs(start_verb,verb_len,"THEN") ) {
9571 RExC_seen |= REG_CUTGROUP_SEEN;
9576 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9578 "Unknown verb pattern '%"UTF8f"'",
9579 UTF8fARG(UTF, verb_len, start_verb));
9582 if ( start_arg && internal_argval ) {
9583 vFAIL3("Verb pattern '%.*s' may not have an argument",
9584 verb_len, start_verb);
9585 } else if ( argok < 0 && !start_arg ) {
9586 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9587 verb_len, start_verb);
9589 ret = reganode(pRExC_state, op, internal_argval);
9590 if ( ! internal_argval && ! SIZE_ONLY ) {
9592 SV *sv = newSVpvn( start_arg,
9593 RExC_parse - start_arg);
9594 ARG(ret) = add_data( pRExC_state,
9596 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9603 if (!internal_argval)
9604 RExC_seen |= REG_VERBARG_SEEN;
9605 } else if ( start_arg ) {
9606 vFAIL3("Verb pattern '%.*s' may not have an argument",
9607 verb_len, start_verb);
9609 ret = reg_node(pRExC_state, op);
9611 nextchar(pRExC_state);
9614 else if (*RExC_parse == '?') { /* (?...) */
9615 bool is_logical = 0;
9616 const char * const seqstart = RExC_parse;
9617 if (has_intervening_patws) {
9619 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9623 paren = *RExC_parse++;
9624 ret = NULL; /* For look-ahead/behind. */
9627 case 'P': /* (?P...) variants for those used to PCRE/Python */
9628 paren = *RExC_parse++;
9629 if ( paren == '<') /* (?P<...>) named capture */
9631 else if (paren == '>') { /* (?P>name) named recursion */
9632 goto named_recursion;
9634 else if (paren == '=') { /* (?P=...) named backref */
9635 /* this pretty much dupes the code for \k<NAME> in
9636 * regatom(), if you change this make sure you change that
9638 char* name_start = RExC_parse;
9640 SV *sv_dat = reg_scan_name(pRExC_state,
9641 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9642 if (RExC_parse == name_start || *RExC_parse != ')')
9643 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9644 vFAIL2("Sequence %.3s... not terminated",parse_start);
9647 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9648 RExC_rxi->data->data[num]=(void*)sv_dat;
9649 SvREFCNT_inc_simple_void(sv_dat);
9652 ret = reganode(pRExC_state,
9655 : (ASCII_FOLD_RESTRICTED)
9657 : (AT_LEAST_UNI_SEMANTICS)
9665 Set_Node_Offset(ret, parse_start+1);
9666 Set_Node_Cur_Length(ret, parse_start);
9668 nextchar(pRExC_state);
9672 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9673 vFAIL3("Sequence (%.*s...) not recognized",
9674 RExC_parse-seqstart, seqstart);
9676 case '<': /* (?<...) */
9677 if (*RExC_parse == '!')
9679 else if (*RExC_parse != '=')
9685 case '\'': /* (?'...') */
9686 name_start= RExC_parse;
9687 svname = reg_scan_name(pRExC_state,
9688 SIZE_ONLY /* reverse test from the others */
9689 ? REG_RSN_RETURN_NAME
9690 : REG_RSN_RETURN_NULL);
9691 if (RExC_parse == name_start || *RExC_parse != paren)
9692 vFAIL2("Sequence (?%c... not terminated",
9693 paren=='>' ? '<' : paren);
9697 if (!svname) /* shouldn't happen */
9699 "panic: reg_scan_name returned NULL");
9700 if (!RExC_paren_names) {
9701 RExC_paren_names= newHV();
9702 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9704 RExC_paren_name_list= newAV();
9705 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9708 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9710 sv_dat = HeVAL(he_str);
9712 /* croak baby croak */
9714 "panic: paren_name hash element allocation failed");
9715 } else if ( SvPOK(sv_dat) ) {
9716 /* (?|...) can mean we have dupes so scan to check
9717 its already been stored. Maybe a flag indicating
9718 we are inside such a construct would be useful,
9719 but the arrays are likely to be quite small, so
9720 for now we punt -- dmq */
9721 IV count = SvIV(sv_dat);
9722 I32 *pv = (I32*)SvPVX(sv_dat);
9724 for ( i = 0 ; i < count ; i++ ) {
9725 if ( pv[i] == RExC_npar ) {
9731 pv = (I32*)SvGROW(sv_dat,
9732 SvCUR(sv_dat) + sizeof(I32)+1);
9733 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9734 pv[count] = RExC_npar;
9735 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9738 (void)SvUPGRADE(sv_dat,SVt_PVNV);
9739 sv_setpvn(sv_dat, (char *)&(RExC_npar),
9742 SvIV_set(sv_dat, 1);
9745 /* Yes this does cause a memory leak in debugging Perls
9747 if (!av_store(RExC_paren_name_list,
9748 RExC_npar, SvREFCNT_inc(svname)))
9749 SvREFCNT_dec_NN(svname);
9752 /*sv_dump(sv_dat);*/
9754 nextchar(pRExC_state);
9756 goto capturing_parens;
9758 RExC_seen |= REG_LOOKBEHIND_SEEN;
9759 RExC_in_lookbehind++;
9762 case '=': /* (?=...) */
9763 RExC_seen_zerolen++;
9765 case '!': /* (?!...) */
9766 RExC_seen_zerolen++;
9767 if (*RExC_parse == ')') {
9768 ret=reg_node(pRExC_state, OPFAIL);
9769 nextchar(pRExC_state);
9773 case '|': /* (?|...) */
9774 /* branch reset, behave like a (?:...) except that
9775 buffers in alternations share the same numbers */
9777 after_freeze = freeze_paren = RExC_npar;
9779 case ':': /* (?:...) */
9780 case '>': /* (?>...) */
9782 case '$': /* (?$...) */
9783 case '@': /* (?@...) */
9784 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9786 case '0' : /* (?0) */
9787 case 'R' : /* (?R) */
9788 if (*RExC_parse != ')')
9789 FAIL("Sequence (?R) not terminated");
9790 ret = reg_node(pRExC_state, GOSTART);
9791 RExC_seen |= REG_GOSTART_SEEN;
9792 *flagp |= POSTPONED;
9793 nextchar(pRExC_state);
9796 /* named and numeric backreferences */
9797 case '&': /* (?&NAME) */
9798 parse_start = RExC_parse - 1;
9801 SV *sv_dat = reg_scan_name(pRExC_state,
9802 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9803 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9805 if (RExC_parse == RExC_end || *RExC_parse != ')')
9806 vFAIL("Sequence (?&... not terminated");
9807 goto gen_recurse_regop;
9808 assert(0); /* NOT REACHED */
9810 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9812 vFAIL("Illegal pattern");
9814 goto parse_recursion;
9816 case '-': /* (?-1) */
9817 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9818 RExC_parse--; /* rewind to let it be handled later */
9822 case '1': case '2': case '3': case '4': /* (?1) */
9823 case '5': case '6': case '7': case '8': case '9':
9826 num = atoi(RExC_parse);
9827 parse_start = RExC_parse - 1; /* MJD */
9828 if (*RExC_parse == '-')
9830 while (isDIGIT(*RExC_parse))
9832 if (*RExC_parse!=')')
9833 vFAIL("Expecting close bracket");
9836 if ( paren == '-' ) {
9838 Diagram of capture buffer numbering.
9839 Top line is the normal capture buffer numbers
9840 Bottom line is the negative indexing as from
9844 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9848 num = RExC_npar + num;
9851 vFAIL("Reference to nonexistent group");
9853 } else if ( paren == '+' ) {
9854 num = RExC_npar + num - 1;
9857 ret = reganode(pRExC_state, GOSUB, num);
9859 if (num > (I32)RExC_rx->nparens) {
9861 vFAIL("Reference to nonexistent group");
9863 ARG2L_SET( ret, RExC_recurse_count++);
9865 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9866 "Recurse #%"UVuf" to %"IVdf"\n",
9867 (UV)ARG(ret), (IV)ARG2L(ret)));
9871 RExC_seen |= REG_RECURSE_SEEN;
9872 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9873 Set_Node_Offset(ret, parse_start); /* MJD */
9875 *flagp |= POSTPONED;
9876 nextchar(pRExC_state);
9879 assert(0); /* NOT REACHED */
9881 case '?': /* (??...) */
9883 if (*RExC_parse != '{') {
9885 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9887 "Sequence (%"UTF8f"...) not recognized",
9888 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9891 *flagp |= POSTPONED;
9892 paren = *RExC_parse++;
9894 case '{': /* (?{...}) */
9897 struct reg_code_block *cb;
9899 RExC_seen_zerolen++;
9901 if ( !pRExC_state->num_code_blocks
9902 || pRExC_state->code_index >= pRExC_state->num_code_blocks
9903 || pRExC_state->code_blocks[pRExC_state->code_index].start
9904 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9907 if (RExC_pm_flags & PMf_USE_RE_EVAL)
9908 FAIL("panic: Sequence (?{...}): no code block found\n");
9909 FAIL("Eval-group not allowed at runtime, use re 'eval'");
9911 /* this is a pre-compiled code block (?{...}) */
9912 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9913 RExC_parse = RExC_start + cb->end;
9916 if (cb->src_regex) {
9917 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9918 RExC_rxi->data->data[n] =
9919 (void*)SvREFCNT_inc((SV*)cb->src_regex);
9920 RExC_rxi->data->data[n+1] = (void*)o;
9923 n = add_data(pRExC_state,
9924 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9925 RExC_rxi->data->data[n] = (void*)o;
9928 pRExC_state->code_index++;
9929 nextchar(pRExC_state);
9933 ret = reg_node(pRExC_state, LOGICAL);
9934 eval = reganode(pRExC_state, EVAL, n);
9937 /* for later propagation into (??{}) return value */
9938 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9940 REGTAIL(pRExC_state, ret, eval);
9941 /* deal with the length of this later - MJD */
9944 ret = reganode(pRExC_state, EVAL, n);
9945 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9946 Set_Node_Offset(ret, parse_start);
9949 case '(': /* (?(?{...})...) and (?(?=...)...) */
9952 if (RExC_parse[0] == '?') { /* (?(?...)) */
9953 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9954 || RExC_parse[1] == '<'
9955 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9959 ret = reg_node(pRExC_state, LOGICAL);
9963 tail = reg(pRExC_state, 1, &flag, depth+1);
9964 if (flag & RESTART_UTF8) {
9965 *flagp = RESTART_UTF8;
9968 REGTAIL(pRExC_state, ret, tail);
9972 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
9973 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
9975 char ch = RExC_parse[0] == '<' ? '>' : '\'';
9976 char *name_start= RExC_parse++;
9978 SV *sv_dat=reg_scan_name(pRExC_state,
9979 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9980 if (RExC_parse == name_start || *RExC_parse != ch)
9981 vFAIL2("Sequence (?(%c... not terminated",
9982 (ch == '>' ? '<' : ch));
9985 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9986 RExC_rxi->data->data[num]=(void*)sv_dat;
9987 SvREFCNT_inc_simple_void(sv_dat);
9989 ret = reganode(pRExC_state,NGROUPP,num);
9990 goto insert_if_check_paren;
9992 else if (RExC_parse[0] == 'D' &&
9993 RExC_parse[1] == 'E' &&
9994 RExC_parse[2] == 'F' &&
9995 RExC_parse[3] == 'I' &&
9996 RExC_parse[4] == 'N' &&
9997 RExC_parse[5] == 'E')
9999 ret = reganode(pRExC_state,DEFINEP,0);
10002 goto insert_if_check_paren;
10004 else if (RExC_parse[0] == 'R') {
10007 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10008 parno = atoi(RExC_parse++);
10009 while (isDIGIT(*RExC_parse))
10011 } else if (RExC_parse[0] == '&') {
10014 sv_dat = reg_scan_name(pRExC_state,
10016 ? REG_RSN_RETURN_NULL
10017 : REG_RSN_RETURN_DATA);
10018 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10020 ret = reganode(pRExC_state,INSUBP,parno);
10021 goto insert_if_check_paren;
10023 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10027 parno = atoi(RExC_parse++);
10029 while (isDIGIT(*RExC_parse))
10031 ret = reganode(pRExC_state, GROUPP, parno);
10033 insert_if_check_paren:
10034 if (*(tmp = nextchar(pRExC_state)) != ')') {
10035 /* nextchar also skips comments, so undo its work
10036 * and skip over the the next character.
10039 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10040 vFAIL("Switch condition not recognized");
10043 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10044 br = regbranch(pRExC_state, &flags, 1,depth+1);
10046 if (flags & RESTART_UTF8) {
10047 *flagp = RESTART_UTF8;
10050 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10053 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10055 c = *nextchar(pRExC_state);
10056 if (flags&HASWIDTH)
10057 *flagp |= HASWIDTH;
10060 vFAIL("(?(DEFINE)....) does not allow branches");
10062 /* Fake one for optimizer. */
10063 lastbr = reganode(pRExC_state, IFTHEN, 0);
10065 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10066 if (flags & RESTART_UTF8) {
10067 *flagp = RESTART_UTF8;
10070 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10073 REGTAIL(pRExC_state, ret, lastbr);
10074 if (flags&HASWIDTH)
10075 *flagp |= HASWIDTH;
10076 c = *nextchar(pRExC_state);
10081 vFAIL("Switch (?(condition)... contains too many branches");
10082 ender = reg_node(pRExC_state, TAIL);
10083 REGTAIL(pRExC_state, br, ender);
10085 REGTAIL(pRExC_state, lastbr, ender);
10086 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10089 REGTAIL(pRExC_state, ret, ender);
10090 RExC_size++; /* XXX WHY do we need this?!!
10091 For large programs it seems to be required
10092 but I can't figure out why. -- dmq*/
10096 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10097 vFAIL("Unknown switch condition (?(...))");
10100 case '[': /* (?[ ... ]) */
10101 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10104 RExC_parse--; /* for vFAIL to print correctly */
10105 vFAIL("Sequence (? incomplete");
10107 default: /* e.g., (?i) */
10110 parse_lparen_question_flags(pRExC_state);
10111 if (UCHARAT(RExC_parse) != ':') {
10112 nextchar(pRExC_state);
10117 nextchar(pRExC_state);
10127 ret = reganode(pRExC_state, OPEN, parno);
10129 if (!RExC_nestroot)
10130 RExC_nestroot = parno;
10131 if (RExC_seen & REG_RECURSE_SEEN
10132 && !RExC_open_parens[parno-1])
10134 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10135 "Setting open paren #%"IVdf" to %d\n",
10136 (IV)parno, REG_NODE_NUM(ret)));
10137 RExC_open_parens[parno-1]= ret;
10140 Set_Node_Length(ret, 1); /* MJD */
10141 Set_Node_Offset(ret, RExC_parse); /* MJD */
10149 /* Pick up the branches, linking them together. */
10150 parse_start = RExC_parse; /* MJD */
10151 br = regbranch(pRExC_state, &flags, 1,depth+1);
10153 /* branch_len = (paren != 0); */
10156 if (flags & RESTART_UTF8) {
10157 *flagp = RESTART_UTF8;
10160 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10162 if (*RExC_parse == '|') {
10163 if (!SIZE_ONLY && RExC_extralen) {
10164 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10167 reginsert(pRExC_state, BRANCH, br, depth+1);
10168 Set_Node_Length(br, paren != 0);
10169 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10173 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10175 else if (paren == ':') {
10176 *flagp |= flags&SIMPLE;
10178 if (is_open) { /* Starts with OPEN. */
10179 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10181 else if (paren != '?') /* Not Conditional */
10183 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10185 while (*RExC_parse == '|') {
10186 if (!SIZE_ONLY && RExC_extralen) {
10187 ender = reganode(pRExC_state, LONGJMP,0);
10189 /* Append to the previous. */
10190 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10193 RExC_extralen += 2; /* Account for LONGJMP. */
10194 nextchar(pRExC_state);
10195 if (freeze_paren) {
10196 if (RExC_npar > after_freeze)
10197 after_freeze = RExC_npar;
10198 RExC_npar = freeze_paren;
10200 br = regbranch(pRExC_state, &flags, 0, depth+1);
10203 if (flags & RESTART_UTF8) {
10204 *flagp = RESTART_UTF8;
10207 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10209 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10211 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10214 if (have_branch || paren != ':') {
10215 /* Make a closing node, and hook it on the end. */
10218 ender = reg_node(pRExC_state, TAIL);
10221 ender = reganode(pRExC_state, CLOSE, parno);
10222 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10223 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10224 "Setting close paren #%"IVdf" to %d\n",
10225 (IV)parno, REG_NODE_NUM(ender)));
10226 RExC_close_parens[parno-1]= ender;
10227 if (RExC_nestroot == parno)
10230 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10231 Set_Node_Length(ender,1); /* MJD */
10237 *flagp &= ~HASWIDTH;
10240 ender = reg_node(pRExC_state, SUCCEED);
10243 ender = reg_node(pRExC_state, END);
10245 assert(!RExC_opend); /* there can only be one! */
10246 RExC_opend = ender;
10250 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10251 SV * const mysv_val1=sv_newmortal();
10252 SV * const mysv_val2=sv_newmortal();
10253 DEBUG_PARSE_MSG("lsbr");
10254 regprop(RExC_rx, mysv_val1, lastbr, NULL);
10255 regprop(RExC_rx, mysv_val2, ender, NULL);
10256 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10257 SvPV_nolen_const(mysv_val1),
10258 (IV)REG_NODE_NUM(lastbr),
10259 SvPV_nolen_const(mysv_val2),
10260 (IV)REG_NODE_NUM(ender),
10261 (IV)(ender - lastbr)
10264 REGTAIL(pRExC_state, lastbr, ender);
10266 if (have_branch && !SIZE_ONLY) {
10267 char is_nothing= 1;
10269 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10271 /* Hook the tails of the branches to the closing node. */
10272 for (br = ret; br; br = regnext(br)) {
10273 const U8 op = PL_regkind[OP(br)];
10274 if (op == BRANCH) {
10275 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10276 if ( OP(NEXTOPER(br)) != NOTHING
10277 || regnext(NEXTOPER(br)) != ender)
10280 else if (op == BRANCHJ) {
10281 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10282 /* for now we always disable this optimisation * /
10283 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10284 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10290 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10291 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10292 SV * const mysv_val1=sv_newmortal();
10293 SV * const mysv_val2=sv_newmortal();
10294 DEBUG_PARSE_MSG("NADA");
10295 regprop(RExC_rx, mysv_val1, ret, NULL);
10296 regprop(RExC_rx, mysv_val2, ender, NULL);
10297 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10298 SvPV_nolen_const(mysv_val1),
10299 (IV)REG_NODE_NUM(ret),
10300 SvPV_nolen_const(mysv_val2),
10301 (IV)REG_NODE_NUM(ender),
10306 if (OP(ender) == TAIL) {
10311 for ( opt= br + 1; opt < ender ; opt++ )
10312 OP(opt)= OPTIMIZED;
10313 NEXT_OFF(br)= ender - br;
10321 static const char parens[] = "=!<,>";
10323 if (paren && (p = strchr(parens, paren))) {
10324 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10325 int flag = (p - parens) > 1;
10328 node = SUSPEND, flag = 0;
10329 reginsert(pRExC_state, node,ret, depth+1);
10330 Set_Node_Cur_Length(ret, parse_start);
10331 Set_Node_Offset(ret, parse_start + 1);
10333 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10337 /* Check for proper termination. */
10339 /* restore original flags, but keep (?p) */
10340 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10341 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10342 RExC_parse = oregcomp_parse;
10343 vFAIL("Unmatched (");
10346 else if (!paren && RExC_parse < RExC_end) {
10347 if (*RExC_parse == ')') {
10349 vFAIL("Unmatched )");
10352 FAIL("Junk on end of regexp"); /* "Can't happen". */
10353 assert(0); /* NOTREACHED */
10356 if (RExC_in_lookbehind) {
10357 RExC_in_lookbehind--;
10359 if (after_freeze > RExC_npar)
10360 RExC_npar = after_freeze;
10365 - regbranch - one alternative of an | operator
10367 * Implements the concatenation operator.
10369 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10373 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10377 regnode *chain = NULL;
10379 I32 flags = 0, c = 0;
10380 GET_RE_DEBUG_FLAGS_DECL;
10382 PERL_ARGS_ASSERT_REGBRANCH;
10384 DEBUG_PARSE("brnc");
10389 if (!SIZE_ONLY && RExC_extralen)
10390 ret = reganode(pRExC_state, BRANCHJ,0);
10392 ret = reg_node(pRExC_state, BRANCH);
10393 Set_Node_Length(ret, 1);
10397 if (!first && SIZE_ONLY)
10398 RExC_extralen += 1; /* BRANCHJ */
10400 *flagp = WORST; /* Tentatively. */
10403 nextchar(pRExC_state);
10404 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10405 flags &= ~TRYAGAIN;
10406 latest = regpiece(pRExC_state, &flags,depth+1);
10407 if (latest == NULL) {
10408 if (flags & TRYAGAIN)
10410 if (flags & RESTART_UTF8) {
10411 *flagp = RESTART_UTF8;
10414 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10416 else if (ret == NULL)
10418 *flagp |= flags&(HASWIDTH|POSTPONED);
10419 if (chain == NULL) /* First piece. */
10420 *flagp |= flags&SPSTART;
10423 REGTAIL(pRExC_state, chain, latest);
10428 if (chain == NULL) { /* Loop ran zero times. */
10429 chain = reg_node(pRExC_state, NOTHING);
10434 *flagp |= flags&SIMPLE;
10441 - regpiece - something followed by possible [*+?]
10443 * Note that the branching code sequences used for ? and the general cases
10444 * of * and + are somewhat optimized: they use the same NOTHING node as
10445 * both the endmarker for their branch list and the body of the last branch.
10446 * It might seem that this node could be dispensed with entirely, but the
10447 * endmarker role is not redundant.
10449 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10451 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10455 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10462 const char * const origparse = RExC_parse;
10464 I32 max = REG_INFTY;
10465 #ifdef RE_TRACK_PATTERN_OFFSETS
10468 const char *maxpos = NULL;
10470 /* Save the original in case we change the emitted regop to a FAIL. */
10471 regnode * const orig_emit = RExC_emit;
10473 GET_RE_DEBUG_FLAGS_DECL;
10475 PERL_ARGS_ASSERT_REGPIECE;
10477 DEBUG_PARSE("piec");
10479 ret = regatom(pRExC_state, &flags,depth+1);
10481 if (flags & (TRYAGAIN|RESTART_UTF8))
10482 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10484 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10490 if (op == '{' && regcurly(RExC_parse)) {
10492 #ifdef RE_TRACK_PATTERN_OFFSETS
10493 parse_start = RExC_parse; /* MJD */
10495 next = RExC_parse + 1;
10496 while (isDIGIT(*next) || *next == ',') {
10497 if (*next == ',') {
10505 if (*next == '}') { /* got one */
10509 min = atoi(RExC_parse);
10510 if (*maxpos == ',')
10513 maxpos = RExC_parse;
10514 max = atoi(maxpos);
10515 if (!max && *maxpos != '0')
10516 max = REG_INFTY; /* meaning "infinity" */
10517 else if (max >= REG_INFTY)
10518 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10520 nextchar(pRExC_state);
10521 if (max < min) { /* If can't match, warn and optimize to fail
10524 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10526 /* We can't back off the size because we have to reserve
10527 * enough space for all the things we are about to throw
10528 * away, but we can shrink it by the ammount we are about
10529 * to re-use here */
10530 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10533 RExC_emit = orig_emit;
10535 ret = reg_node(pRExC_state, OPFAIL);
10538 else if (min == max
10539 && RExC_parse < RExC_end
10540 && (*RExC_parse == '?' || *RExC_parse == '+'))
10543 ckWARN2reg(RExC_parse + 1,
10544 "Useless use of greediness modifier '%c'",
10547 /* Absorb the modifier, so later code doesn't see nor use
10549 nextchar(pRExC_state);
10553 if ((flags&SIMPLE)) {
10554 RExC_naughty += 2 + RExC_naughty / 2;
10555 reginsert(pRExC_state, CURLY, ret, depth+1);
10556 Set_Node_Offset(ret, parse_start+1); /* MJD */
10557 Set_Node_Cur_Length(ret, parse_start);
10560 regnode * const w = reg_node(pRExC_state, WHILEM);
10563 REGTAIL(pRExC_state, ret, w);
10564 if (!SIZE_ONLY && RExC_extralen) {
10565 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10566 reginsert(pRExC_state, NOTHING,ret, depth+1);
10567 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10569 reginsert(pRExC_state, CURLYX,ret, depth+1);
10571 Set_Node_Offset(ret, parse_start+1);
10572 Set_Node_Length(ret,
10573 op == '{' ? (RExC_parse - parse_start) : 1);
10575 if (!SIZE_ONLY && RExC_extralen)
10576 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10577 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10579 RExC_whilem_seen++, RExC_extralen += 3;
10580 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10587 *flagp |= HASWIDTH;
10589 ARG1_SET(ret, (U16)min);
10590 ARG2_SET(ret, (U16)max);
10592 if (max == REG_INFTY)
10593 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10599 if (!ISMULT1(op)) {
10604 #if 0 /* Now runtime fix should be reliable. */
10606 /* if this is reinstated, don't forget to put this back into perldiag:
10608 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10610 (F) The part of the regexp subject to either the * or + quantifier
10611 could match an empty string. The {#} shows in the regular
10612 expression about where the problem was discovered.
10616 if (!(flags&HASWIDTH) && op != '?')
10617 vFAIL("Regexp *+ operand could be empty");
10620 #ifdef RE_TRACK_PATTERN_OFFSETS
10621 parse_start = RExC_parse;
10623 nextchar(pRExC_state);
10625 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10627 if (op == '*' && (flags&SIMPLE)) {
10628 reginsert(pRExC_state, STAR, ret, depth+1);
10631 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10633 else if (op == '*') {
10637 else if (op == '+' && (flags&SIMPLE)) {
10638 reginsert(pRExC_state, PLUS, ret, depth+1);
10641 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10643 else if (op == '+') {
10647 else if (op == '?') {
10652 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10653 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10654 ckWARN2reg(RExC_parse,
10655 "%"UTF8f" matches null string many times",
10656 UTF8fARG(UTF, (RExC_parse >= origparse
10657 ? RExC_parse - origparse
10660 (void)ReREFCNT_inc(RExC_rx_sv);
10663 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10664 nextchar(pRExC_state);
10665 reginsert(pRExC_state, MINMOD, ret, depth+1);
10666 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10669 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10671 nextchar(pRExC_state);
10672 ender = reg_node(pRExC_state, SUCCEED);
10673 REGTAIL(pRExC_state, ret, ender);
10674 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10676 ender = reg_node(pRExC_state, TAIL);
10677 REGTAIL(pRExC_state, ret, ender);
10680 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10682 vFAIL("Nested quantifiers");
10689 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10690 UV *valuep, I32 *flagp, U32 depth, bool in_char_class,
10691 const bool strict /* Apply stricter parsing rules? */
10695 /* This is expected to be called by a parser routine that has recognized '\N'
10696 and needs to handle the rest. RExC_parse is expected to point at the first
10697 char following the N at the time of the call. On successful return,
10698 RExC_parse has been updated to point to just after the sequence identified
10699 by this routine, and <*flagp> has been updated.
10701 The \N may be inside (indicated by the boolean <in_char_class>) or outside a
10704 \N may begin either a named sequence, or if outside a character class, mean
10705 to match a non-newline. For non single-quoted regexes, the tokenizer has
10706 attempted to decide which, and in the case of a named sequence, converted it
10707 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10708 where c1... are the characters in the sequence. For single-quoted regexes,
10709 the tokenizer passes the \N sequence through unchanged; this code will not
10710 attempt to determine this nor expand those, instead raising a syntax error.
10711 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10712 or there is no '}', it signals that this \N occurrence means to match a
10715 Only the \N{U+...} form should occur in a character class, for the same
10716 reason that '.' inside a character class means to just match a period: it
10717 just doesn't make sense.
10719 The function raises an error (via vFAIL), and doesn't return for various
10720 syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
10721 success; it returns FALSE otherwise. Returns FALSE, setting *flagp to
10722 RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is
10723 only possible if node_p is non-NULL.
10726 If <valuep> is non-null, it means the caller can accept an input sequence
10727 consisting of a just a single code point; <*valuep> is set to that value
10728 if the input is such.
10730 If <node_p> is non-null it signifies that the caller can accept any other
10731 legal sequence (i.e., one that isn't just a single code point). <*node_p>
10733 1) \N means not-a-NL: points to a newly created REG_ANY node;
10734 2) \N{}: points to a new NOTHING node;
10735 3) otherwise: points to a new EXACT node containing the resolved
10737 Note that FALSE is returned for single code point sequences if <valuep> is
10741 char * endbrace; /* '}' following the name */
10743 char *endchar; /* Points to '.' or '}' ending cur char in the input
10745 bool has_multiple_chars; /* true if the input stream contains a sequence of
10746 more than one character */
10748 GET_RE_DEBUG_FLAGS_DECL;
10750 PERL_ARGS_ASSERT_GROK_BSLASH_N;
10752 GET_RE_DEBUG_FLAGS;
10754 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
10756 /* The [^\n] meaning of \N ignores spaces and comments under the /x
10757 * modifier. The other meaning does not, so use a temporary until we find
10758 * out which we are being called with */
10759 p = (RExC_flags & RXf_PMf_EXTENDED)
10760 ? regpatws(pRExC_state, RExC_parse,
10761 TRUE) /* means recognize comments */
10764 /* Disambiguate between \N meaning a named character versus \N meaning
10765 * [^\n]. The former is assumed when it can't be the latter. */
10766 if (*p != '{' || regcurly(p)) {
10769 /* no bare \N allowed in a charclass */
10770 if (in_char_class) {
10771 vFAIL("\\N in a character class must be a named character: \\N{...}");
10775 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
10777 nextchar(pRExC_state);
10778 *node_p = reg_node(pRExC_state, REG_ANY);
10779 *flagp |= HASWIDTH|SIMPLE;
10781 Set_Node_Length(*node_p, 1); /* MJD */
10785 /* Here, we have decided it should be a named character or sequence */
10787 /* The test above made sure that the next real character is a '{', but
10788 * under the /x modifier, it could be separated by space (or a comment and
10789 * \n) and this is not allowed (for consistency with \x{...} and the
10790 * tokenizer handling of \N{NAME}). */
10791 if (*RExC_parse != '{') {
10792 vFAIL("Missing braces on \\N{}");
10795 RExC_parse++; /* Skip past the '{' */
10797 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10798 || ! (endbrace == RExC_parse /* nothing between the {} */
10799 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
10801 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10804 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10805 vFAIL("\\N{NAME} must be resolved by the lexer");
10808 if (endbrace == RExC_parse) { /* empty: \N{} */
10811 *node_p = reg_node(pRExC_state,NOTHING);
10813 else if (in_char_class) {
10814 if (SIZE_ONLY && in_char_class) {
10816 RExC_parse++; /* Position after the "}" */
10817 vFAIL("Zero length \\N{}");
10820 ckWARNreg(RExC_parse,
10821 "Ignoring zero length \\N{} in character class");
10829 nextchar(pRExC_state);
10833 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10834 RExC_parse += 2; /* Skip past the 'U+' */
10836 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10838 /* Code points are separated by dots. If none, there is only one code
10839 * point, and is terminated by the brace */
10840 has_multiple_chars = (endchar < endbrace);
10842 if (valuep && (! has_multiple_chars || in_char_class)) {
10843 /* We only pay attention to the first char of
10844 multichar strings being returned in char classes. I kinda wonder
10845 if this makes sense as it does change the behaviour
10846 from earlier versions, OTOH that behaviour was broken
10847 as well. XXX Solution is to recharacterize as
10848 [rest-of-class]|multi1|multi2... */
10850 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10851 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10852 | PERL_SCAN_DISALLOW_PREFIX
10853 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
10855 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10857 /* The tokenizer should have guaranteed validity, but it's possible to
10858 * bypass it by using single quoting, so check */
10859 if (length_of_hex == 0
10860 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10862 RExC_parse += length_of_hex; /* Includes all the valid */
10863 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10864 ? UTF8SKIP(RExC_parse)
10866 /* Guard against malformed utf8 */
10867 if (RExC_parse >= endchar) {
10868 RExC_parse = endchar;
10870 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10873 if (in_char_class && has_multiple_chars) {
10875 RExC_parse = endbrace;
10876 vFAIL("\\N{} in character class restricted to one character");
10879 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
10883 RExC_parse = endbrace + 1;
10885 else if (! node_p || ! has_multiple_chars) {
10887 /* Here, the input is legal, but not according to the caller's
10888 * options. We fail without advancing the parse, so that the
10889 * caller can try again */
10895 /* What is done here is to convert this to a sub-pattern of the form
10896 * (?:\x{char1}\x{char2}...)
10897 * and then call reg recursively. That way, it retains its atomicness,
10898 * while not having to worry about special handling that some code
10899 * points may have. toke.c has converted the original Unicode values
10900 * to native, so that we can just pass on the hex values unchanged. We
10901 * do have to set a flag to keep recoding from happening in the
10904 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
10906 char *orig_end = RExC_end;
10909 while (RExC_parse < endbrace) {
10911 /* Convert to notation the rest of the code understands */
10912 sv_catpv(substitute_parse, "\\x{");
10913 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
10914 sv_catpv(substitute_parse, "}");
10916 /* Point to the beginning of the next character in the sequence. */
10917 RExC_parse = endchar + 1;
10918 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10920 sv_catpv(substitute_parse, ")");
10922 RExC_parse = SvPV(substitute_parse, len);
10924 /* Don't allow empty number */
10926 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10928 RExC_end = RExC_parse + len;
10930 /* The values are Unicode, and therefore not subject to recoding */
10931 RExC_override_recoding = 1;
10933 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10934 if (flags & RESTART_UTF8) {
10935 *flagp = RESTART_UTF8;
10938 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
10941 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
10943 RExC_parse = endbrace;
10944 RExC_end = orig_end;
10945 RExC_override_recoding = 0;
10947 nextchar(pRExC_state);
10957 * It returns the code point in utf8 for the value in *encp.
10958 * value: a code value in the source encoding
10959 * encp: a pointer to an Encode object
10961 * If the result from Encode is not a single character,
10962 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
10965 S_reg_recode(pTHX_ const char value, SV **encp)
10968 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
10969 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
10970 const STRLEN newlen = SvCUR(sv);
10971 UV uv = UNICODE_REPLACEMENT;
10973 PERL_ARGS_ASSERT_REG_RECODE;
10977 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
10980 if (!newlen || numlen != newlen) {
10981 uv = UNICODE_REPLACEMENT;
10987 PERL_STATIC_INLINE U8
10988 S_compute_EXACTish(RExC_state_t *pRExC_state)
10992 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
10998 op = get_regex_charset(RExC_flags);
10999 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11000 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11001 been, so there is no hole */
11004 return op + EXACTF;
11007 PERL_STATIC_INLINE void
11008 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11009 regnode *node, I32* flagp, STRLEN len, UV code_point,
11012 /* This knows the details about sizing an EXACTish node, setting flags for
11013 * it (by setting <*flagp>, and potentially populating it with a single
11016 * If <len> (the length in bytes) is non-zero, this function assumes that
11017 * the node has already been populated, and just does the sizing. In this
11018 * case <code_point> should be the final code point that has already been
11019 * placed into the node. This value will be ignored except that under some
11020 * circumstances <*flagp> is set based on it.
11022 * If <len> is zero, the function assumes that the node is to contain only
11023 * the single character given by <code_point> and calculates what <len>
11024 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11025 * additionally will populate the node's STRING with <code_point> or its
11028 * In both cases <*flagp> is appropriately set
11030 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11031 * 255, must be folded (the former only when the rules indicate it can
11034 * When it does the populating, it looks at the flag 'downgradable'. If
11035 * true with a node that folds, it checks if the single code point
11036 * participates in a fold, and if not downgrades the node to an EXACT.
11037 * This helps the optimizer */
11039 bool len_passed_in = cBOOL(len != 0);
11040 U8 character[UTF8_MAXBYTES_CASE+1];
11042 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11044 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11045 * sizing difference, and is extra work that is thrown away */
11046 if (downgradable && ! PASS2) {
11047 downgradable = FALSE;
11050 if (! len_passed_in) {
11052 if (UNI_IS_INVARIANT(code_point)) {
11053 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11054 *character = (U8) code_point;
11056 else { /* Here is /i and not /l (toFOLD() is defined on just
11057 ASCII, which isn't the same thing as INVARIANT on
11058 EBCDIC, but it works there, as the extra invariants
11059 fold to themselves) */
11060 *character = toFOLD((U8) code_point);
11062 && *character == code_point
11063 && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
11070 else if (FOLD && (! LOC
11071 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11072 { /* Folding, and ok to do so now */
11073 UV folded = _to_uni_fold_flags(
11077 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11078 ? FOLD_FLAGS_NOMIX_ASCII
11081 && folded == code_point
11082 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11087 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11089 /* Not folding this cp, and can output it directly */
11090 *character = UTF8_TWO_BYTE_HI(code_point);
11091 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11095 uvchr_to_utf8( character, code_point);
11096 len = UTF8SKIP(character);
11098 } /* Else pattern isn't UTF8. */
11100 *character = (U8) code_point;
11102 } /* Else is folded non-UTF8 */
11103 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11105 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11106 * comments at join_exact()); */
11107 *character = (U8) code_point;
11110 /* Can turn into an EXACT node if we know the fold at compile time,
11111 * and it folds to itself and doesn't particpate in other folds */
11114 && PL_fold_latin1[code_point] == code_point
11115 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11116 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11120 } /* else is Sharp s. May need to fold it */
11121 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11123 *(character + 1) = 's';
11127 *character = LATIN_SMALL_LETTER_SHARP_S;
11133 RExC_size += STR_SZ(len);
11136 RExC_emit += STR_SZ(len);
11137 STR_LEN(node) = len;
11138 if (! len_passed_in) {
11139 Copy((char *) character, STRING(node), len, char);
11143 *flagp |= HASWIDTH;
11145 /* A single character node is SIMPLE, except for the special-cased SHARP S
11147 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11148 && (code_point != LATIN_SMALL_LETTER_SHARP_S
11149 || ! FOLD || ! DEPENDS_SEMANTICS))
11154 /* The OP may not be well defined in PASS1 */
11155 if (PASS2 && OP(node) == EXACTFL) {
11156 RExC_contains_locale = 1;
11161 /* return atoi(p), unless it's too big to sensibly be a backref,
11162 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11165 S_backref_value(char *p)
11169 for (;isDIGIT(*q); q++) {} /* calculate length of num */
11170 if (q - p == 0 || q - p > 9)
11177 - regatom - the lowest level
11179 Try to identify anything special at the start of the pattern. If there
11180 is, then handle it as required. This may involve generating a single regop,
11181 such as for an assertion; or it may involve recursing, such as to
11182 handle a () structure.
11184 If the string doesn't start with something special then we gobble up
11185 as much literal text as we can.
11187 Once we have been able to handle whatever type of thing started the
11188 sequence, we return.
11190 Note: we have to be careful with escapes, as they can be both literal
11191 and special, and in the case of \10 and friends, context determines which.
11193 A summary of the code structure is:
11195 switch (first_byte) {
11196 cases for each special:
11197 handle this special;
11200 switch (2nd byte) {
11201 cases for each unambiguous special:
11202 handle this special;
11204 cases for each ambigous special/literal:
11206 if (special) handle here
11208 default: // unambiguously literal:
11211 default: // is a literal char
11214 create EXACTish node for literal;
11215 while (more input and node isn't full) {
11216 switch (input_byte) {
11217 cases for each special;
11218 make sure parse pointer is set so that the next call to
11219 regatom will see this special first
11220 goto loopdone; // EXACTish node terminated by prev. char
11222 append char to EXACTISH node;
11224 get next input byte;
11228 return the generated node;
11230 Specifically there are two separate switches for handling
11231 escape sequences, with the one for handling literal escapes requiring
11232 a dummy entry for all of the special escapes that are actually handled
11235 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11237 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11239 Otherwise does not return NULL.
11243 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11246 regnode *ret = NULL;
11248 char *parse_start = RExC_parse;
11253 GET_RE_DEBUG_FLAGS_DECL;
11255 *flagp = WORST; /* Tentatively. */
11257 DEBUG_PARSE("atom");
11259 PERL_ARGS_ASSERT_REGATOM;
11262 switch ((U8)*RExC_parse) {
11264 RExC_seen_zerolen++;
11265 nextchar(pRExC_state);
11266 if (RExC_flags & RXf_PMf_MULTILINE)
11267 ret = reg_node(pRExC_state, MBOL);
11268 else if (RExC_flags & RXf_PMf_SINGLELINE)
11269 ret = reg_node(pRExC_state, SBOL);
11271 ret = reg_node(pRExC_state, BOL);
11272 Set_Node_Length(ret, 1); /* MJD */
11275 nextchar(pRExC_state);
11277 RExC_seen_zerolen++;
11278 if (RExC_flags & RXf_PMf_MULTILINE)
11279 ret = reg_node(pRExC_state, MEOL);
11280 else if (RExC_flags & RXf_PMf_SINGLELINE)
11281 ret = reg_node(pRExC_state, SEOL);
11283 ret = reg_node(pRExC_state, EOL);
11284 Set_Node_Length(ret, 1); /* MJD */
11287 nextchar(pRExC_state);
11288 if (RExC_flags & RXf_PMf_SINGLELINE)
11289 ret = reg_node(pRExC_state, SANY);
11291 ret = reg_node(pRExC_state, REG_ANY);
11292 *flagp |= HASWIDTH|SIMPLE;
11294 Set_Node_Length(ret, 1); /* MJD */
11298 char * const oregcomp_parse = ++RExC_parse;
11299 ret = regclass(pRExC_state, flagp,depth+1,
11300 FALSE, /* means parse the whole char class */
11301 TRUE, /* allow multi-char folds */
11302 FALSE, /* don't silence non-portable warnings. */
11304 if (*RExC_parse != ']') {
11305 RExC_parse = oregcomp_parse;
11306 vFAIL("Unmatched [");
11309 if (*flagp & RESTART_UTF8)
11311 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11314 nextchar(pRExC_state);
11315 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11319 nextchar(pRExC_state);
11320 ret = reg(pRExC_state, 2, &flags,depth+1);
11322 if (flags & TRYAGAIN) {
11323 if (RExC_parse == RExC_end) {
11324 /* Make parent create an empty node if needed. */
11325 *flagp |= TRYAGAIN;
11330 if (flags & RESTART_UTF8) {
11331 *flagp = RESTART_UTF8;
11334 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11337 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11341 if (flags & TRYAGAIN) {
11342 *flagp |= TRYAGAIN;
11345 vFAIL("Internal urp");
11346 /* Supposed to be caught earlier. */
11352 vFAIL("Quantifier follows nothing");
11357 This switch handles escape sequences that resolve to some kind
11358 of special regop and not to literal text. Escape sequnces that
11359 resolve to literal text are handled below in the switch marked
11362 Every entry in this switch *must* have a corresponding entry
11363 in the literal escape switch. However, the opposite is not
11364 required, as the default for this switch is to jump to the
11365 literal text handling code.
11367 switch ((U8)*++RExC_parse) {
11368 /* Special Escapes */
11370 RExC_seen_zerolen++;
11371 ret = reg_node(pRExC_state, SBOL);
11373 goto finish_meta_pat;
11375 ret = reg_node(pRExC_state, GPOS);
11376 RExC_seen |= REG_GPOS_SEEN;
11378 goto finish_meta_pat;
11380 RExC_seen_zerolen++;
11381 ret = reg_node(pRExC_state, KEEPS);
11383 /* XXX:dmq : disabling in-place substitution seems to
11384 * be necessary here to avoid cases of memory corruption, as
11385 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11387 RExC_seen |= REG_LOOKBEHIND_SEEN;
11388 goto finish_meta_pat;
11390 ret = reg_node(pRExC_state, SEOL);
11392 RExC_seen_zerolen++; /* Do not optimize RE away */
11393 goto finish_meta_pat;
11395 ret = reg_node(pRExC_state, EOS);
11397 RExC_seen_zerolen++; /* Do not optimize RE away */
11398 goto finish_meta_pat;
11400 ret = reg_node(pRExC_state, CANY);
11401 RExC_seen |= REG_CANY_SEEN;
11402 *flagp |= HASWIDTH|SIMPLE;
11403 goto finish_meta_pat;
11405 ret = reg_node(pRExC_state, CLUMP);
11406 *flagp |= HASWIDTH;
11407 goto finish_meta_pat;
11413 arg = ANYOF_WORDCHAR;
11417 RExC_seen_zerolen++;
11418 RExC_seen |= REG_LOOKBEHIND_SEEN;
11419 op = BOUND + get_regex_charset(RExC_flags);
11420 if (op > BOUNDA) { /* /aa is same as /a */
11423 else if (op == BOUNDL) {
11424 RExC_contains_locale = 1;
11426 ret = reg_node(pRExC_state, op);
11427 FLAGS(ret) = get_regex_charset(RExC_flags);
11429 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11430 /* diag_listed_as: Use "%s" instead of "%s" */
11431 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11433 goto finish_meta_pat;
11435 RExC_seen_zerolen++;
11436 RExC_seen |= REG_LOOKBEHIND_SEEN;
11437 op = NBOUND + get_regex_charset(RExC_flags);
11438 if (op > NBOUNDA) { /* /aa is same as /a */
11441 else if (op == NBOUNDL) {
11442 RExC_contains_locale = 1;
11444 ret = reg_node(pRExC_state, op);
11445 FLAGS(ret) = get_regex_charset(RExC_flags);
11447 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
11448 /* diag_listed_as: Use "%s" instead of "%s" */
11449 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11451 goto finish_meta_pat;
11461 ret = reg_node(pRExC_state, LNBREAK);
11462 *flagp |= HASWIDTH|SIMPLE;
11463 goto finish_meta_pat;
11471 goto join_posix_op_known;
11477 arg = ANYOF_VERTWS;
11479 goto join_posix_op_known;
11489 op = POSIXD + get_regex_charset(RExC_flags);
11490 if (op > POSIXA) { /* /aa is same as /a */
11493 else if (op == POSIXL) {
11494 RExC_contains_locale = 1;
11497 join_posix_op_known:
11500 op += NPOSIXD - POSIXD;
11503 ret = reg_node(pRExC_state, op);
11505 FLAGS(ret) = namedclass_to_classnum(arg);
11508 *flagp |= HASWIDTH|SIMPLE;
11512 nextchar(pRExC_state);
11513 Set_Node_Length(ret, 2); /* MJD */
11519 char* parse_start = RExC_parse - 2;
11524 ret = regclass(pRExC_state, flagp,depth+1,
11525 TRUE, /* means just parse this element */
11526 FALSE, /* don't allow multi-char folds */
11527 FALSE, /* don't silence non-portable warnings.
11528 It would be a bug if these returned
11531 /* regclass() can only return RESTART_UTF8 if multi-char folds
11534 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11539 Set_Node_Offset(ret, parse_start + 2);
11540 Set_Node_Cur_Length(ret, parse_start);
11541 nextchar(pRExC_state);
11545 /* Handle \N and \N{NAME} with multiple code points here and not
11546 * below because it can be multicharacter. join_exact() will join
11547 * them up later on. Also this makes sure that things like
11548 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11549 * The options to the grok function call causes it to fail if the
11550 * sequence is just a single code point. We then go treat it as
11551 * just another character in the current EXACT node, and hence it
11552 * gets uniform treatment with all the other characters. The
11553 * special treatment for quantifiers is not needed for such single
11554 * character sequences */
11556 if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE,
11557 FALSE /* not strict */ )) {
11558 if (*flagp & RESTART_UTF8)
11564 case 'k': /* Handle \k<NAME> and \k'NAME' */
11567 char ch= RExC_parse[1];
11568 if (ch != '<' && ch != '\'' && ch != '{') {
11570 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11571 vFAIL2("Sequence %.2s... not terminated",parse_start);
11573 /* this pretty much dupes the code for (?P=...) in reg(), if
11574 you change this make sure you change that */
11575 char* name_start = (RExC_parse += 2);
11577 SV *sv_dat = reg_scan_name(pRExC_state,
11578 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11579 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11580 if (RExC_parse == name_start || *RExC_parse != ch)
11581 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11582 vFAIL2("Sequence %.3s... not terminated",parse_start);
11585 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11586 RExC_rxi->data->data[num]=(void*)sv_dat;
11587 SvREFCNT_inc_simple_void(sv_dat);
11591 ret = reganode(pRExC_state,
11594 : (ASCII_FOLD_RESTRICTED)
11596 : (AT_LEAST_UNI_SEMANTICS)
11602 *flagp |= HASWIDTH;
11604 /* override incorrect value set in reganode MJD */
11605 Set_Node_Offset(ret, parse_start+1);
11606 Set_Node_Cur_Length(ret, parse_start);
11607 nextchar(pRExC_state);
11613 case '1': case '2': case '3': case '4':
11614 case '5': case '6': case '7': case '8': case '9':
11619 if (*RExC_parse == 'g') {
11623 if (*RExC_parse == '{') {
11627 if (*RExC_parse == '-') {
11631 if (hasbrace && !isDIGIT(*RExC_parse)) {
11632 if (isrel) RExC_parse--;
11634 goto parse_named_seq;
11637 num = S_backref_value(RExC_parse);
11639 vFAIL("Reference to invalid group 0");
11640 else if (num == I32_MAX) {
11641 if (isDIGIT(*RExC_parse))
11642 vFAIL("Reference to nonexistent group");
11644 vFAIL("Unterminated \\g... pattern");
11648 num = RExC_npar - num;
11650 vFAIL("Reference to nonexistent or unclosed group");
11654 num = S_backref_value(RExC_parse);
11655 /* bare \NNN might be backref or octal - if it is larger than or equal
11656 * RExC_npar then it is assumed to be and octal escape.
11657 * Note RExC_npar is +1 from the actual number of parens*/
11658 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11659 && *RExC_parse != '8' && *RExC_parse != '9'))
11661 /* Probably a character specified in octal, e.g. \35 */
11666 /* at this point RExC_parse definitely points to a backref
11669 #ifdef RE_TRACK_PATTERN_OFFSETS
11670 char * const parse_start = RExC_parse - 1; /* MJD */
11672 while (isDIGIT(*RExC_parse))
11675 if (*RExC_parse != '}')
11676 vFAIL("Unterminated \\g{...} pattern");
11680 if (num > (I32)RExC_rx->nparens)
11681 vFAIL("Reference to nonexistent group");
11684 ret = reganode(pRExC_state,
11687 : (ASCII_FOLD_RESTRICTED)
11689 : (AT_LEAST_UNI_SEMANTICS)
11695 *flagp |= HASWIDTH;
11697 /* override incorrect value set in reganode MJD */
11698 Set_Node_Offset(ret, parse_start+1);
11699 Set_Node_Cur_Length(ret, parse_start);
11701 nextchar(pRExC_state);
11706 if (RExC_parse >= RExC_end)
11707 FAIL("Trailing \\");
11710 /* Do not generate "unrecognized" warnings here, we fall
11711 back into the quick-grab loop below */
11718 if (RExC_flags & RXf_PMf_EXTENDED) {
11719 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
11720 if (RExC_parse < RExC_end)
11727 parse_start = RExC_parse - 1;
11736 #define MAX_NODE_STRING_SIZE 127
11737 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11739 U8 upper_parse = MAX_NODE_STRING_SIZE;
11740 U8 node_type = compute_EXACTish(pRExC_state);
11741 bool next_is_quantifier;
11742 char * oldp = NULL;
11744 /* We can convert EXACTF nodes to EXACTFU if they contain only
11745 * characters that match identically regardless of the target
11746 * string's UTF8ness. The reason to do this is that EXACTF is not
11747 * trie-able, EXACTFU is.
11749 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11750 * contain only above-Latin1 characters (hence must be in UTF8),
11751 * which don't participate in folds with Latin1-range characters,
11752 * as the latter's folds aren't known until runtime. (We don't
11753 * need to figure this out until pass 2) */
11754 bool maybe_exactfu = PASS2
11755 && (node_type == EXACTF || node_type == EXACTFL);
11757 /* If a folding node contains only code points that don't
11758 * participate in folds, it can be changed into an EXACT node,
11759 * which allows the optimizer more things to look for */
11762 ret = reg_node(pRExC_state, node_type);
11764 /* In pass1, folded, we use a temporary buffer instead of the
11765 * actual node, as the node doesn't exist yet */
11766 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11772 /* We do the EXACTFish to EXACT node only if folding. (And we
11773 * don't need to figure this out until pass 2) */
11774 maybe_exact = FOLD && PASS2;
11776 /* XXX The node can hold up to 255 bytes, yet this only goes to
11777 * 127. I (khw) do not know why. Keeping it somewhat less than
11778 * 255 allows us to not have to worry about overflow due to
11779 * converting to utf8 and fold expansion, but that value is
11780 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
11781 * split up by this limit into a single one using the real max of
11782 * 255. Even at 127, this breaks under rare circumstances. If
11783 * folding, we do not want to split a node at a character that is a
11784 * non-final in a multi-char fold, as an input string could just
11785 * happen to want to match across the node boundary. The join
11786 * would solve that problem if the join actually happens. But a
11787 * series of more than two nodes in a row each of 127 would cause
11788 * the first join to succeed to get to 254, but then there wouldn't
11789 * be room for the next one, which could at be one of those split
11790 * multi-char folds. I don't know of any fool-proof solution. One
11791 * could back off to end with only a code point that isn't such a
11792 * non-final, but it is possible for there not to be any in the
11794 for (p = RExC_parse - 1;
11795 len < upper_parse && p < RExC_end;
11800 if (RExC_flags & RXf_PMf_EXTENDED)
11801 p = regpatws(pRExC_state, p,
11802 TRUE); /* means recognize comments */
11813 /* Literal Escapes Switch
11815 This switch is meant to handle escape sequences that
11816 resolve to a literal character.
11818 Every escape sequence that represents something
11819 else, like an assertion or a char class, is handled
11820 in the switch marked 'Special Escapes' above in this
11821 routine, but also has an entry here as anything that
11822 isn't explicitly mentioned here will be treated as
11823 an unescaped equivalent literal.
11826 switch ((U8)*++p) {
11827 /* These are all the special escapes. */
11828 case 'A': /* Start assertion */
11829 case 'b': case 'B': /* Word-boundary assertion*/
11830 case 'C': /* Single char !DANGEROUS! */
11831 case 'd': case 'D': /* digit class */
11832 case 'g': case 'G': /* generic-backref, pos assertion */
11833 case 'h': case 'H': /* HORIZWS */
11834 case 'k': case 'K': /* named backref, keep marker */
11835 case 'p': case 'P': /* Unicode property */
11836 case 'R': /* LNBREAK */
11837 case 's': case 'S': /* space class */
11838 case 'v': case 'V': /* VERTWS */
11839 case 'w': case 'W': /* word class */
11840 case 'X': /* eXtended Unicode "combining
11841 character sequence" */
11842 case 'z': case 'Z': /* End of line/string assertion */
11846 /* Anything after here is an escape that resolves to a
11847 literal. (Except digits, which may or may not)
11853 case 'N': /* Handle a single-code point named character. */
11854 /* The options cause it to fail if a multiple code
11855 * point sequence. Handle those in the switch() above
11857 RExC_parse = p + 1;
11858 if (! grok_bslash_N(pRExC_state, NULL, &ender,
11859 flagp, depth, FALSE,
11860 FALSE /* not strict */ ))
11862 if (*flagp & RESTART_UTF8)
11863 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11864 RExC_parse = p = oldp;
11868 if (ender > 0xff) {
11885 ender = ASCII_TO_NATIVE('\033');
11895 const char* error_msg;
11897 bool valid = grok_bslash_o(&p,
11900 TRUE, /* out warnings */
11901 FALSE, /* not strict */
11902 TRUE, /* Output warnings
11907 RExC_parse = p; /* going to die anyway; point
11908 to exact spot of failure */
11912 if (PL_encoding && ender < 0x100) {
11913 goto recode_encoding;
11915 if (ender > 0xff) {
11922 UV result = UV_MAX; /* initialize to erroneous
11924 const char* error_msg;
11926 bool valid = grok_bslash_x(&p,
11929 TRUE, /* out warnings */
11930 FALSE, /* not strict */
11931 TRUE, /* Output warnings
11936 RExC_parse = p; /* going to die anyway; point
11937 to exact spot of failure */
11942 if (PL_encoding && ender < 0x100) {
11943 goto recode_encoding;
11945 if (ender > 0xff) {
11952 ender = grok_bslash_c(*p++, SIZE_ONLY);
11954 case '8': case '9': /* must be a backreference */
11957 case '1': case '2': case '3':case '4':
11958 case '5': case '6': case '7':
11959 /* When we parse backslash escapes there is ambiguity
11960 * between backreferences and octal escapes. Any escape
11961 * from \1 - \9 is a backreference, any multi-digit
11962 * escape which does not start with 0 and which when
11963 * evaluated as decimal could refer to an already
11964 * parsed capture buffer is a backslash. Anything else
11967 * Note this implies that \118 could be interpreted as
11968 * 118 OR as "\11" . "8" depending on whether there
11969 * were 118 capture buffers defined already in the
11972 /* NOTE, RExC_npar is 1 more than the actual number of
11973 * parens we have seen so far, hence the < RExC_npar below. */
11975 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
11976 { /* Not to be treated as an octal constant, go
11984 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
11986 ender = grok_oct(p, &numlen, &flags, NULL);
11987 if (ender > 0xff) {
11991 if (SIZE_ONLY /* like \08, \178 */
11994 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
11996 reg_warn_non_literal_string(
11998 form_short_octal_warning(p, numlen));
12001 if (PL_encoding && ender < 0x100)
12002 goto recode_encoding;
12005 if (! RExC_override_recoding) {
12006 SV* enc = PL_encoding;
12007 ender = reg_recode((const char)(U8)ender, &enc);
12008 if (!enc && SIZE_ONLY)
12009 ckWARNreg(p, "Invalid escape in the specified encoding");
12015 FAIL("Trailing \\");
12018 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12019 /* Include any { following the alpha to emphasize
12020 * that it could be part of an escape at some point
12022 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12023 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12025 goto normal_default;
12026 } /* End of switch on '\' */
12029 /* Currently we don't warn when the lbrace is at the start
12030 * of a construct. This catches it in the middle of a
12031 * literal string, or when its the first thing after
12032 * something like "\b" */
12034 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12036 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12039 default: /* A literal character */
12041 if (UTF8_IS_START(*p) && UTF) {
12043 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12044 &numlen, UTF8_ALLOW_DEFAULT);
12050 } /* End of switch on the literal */
12052 /* Here, have looked at the literal character and <ender>
12053 * contains its ordinal, <p> points to the character after it
12056 if ( RExC_flags & RXf_PMf_EXTENDED)
12057 p = regpatws(pRExC_state, p,
12058 TRUE); /* means recognize comments */
12060 /* If the next thing is a quantifier, it applies to this
12061 * character only, which means that this character has to be in
12062 * its own node and can't just be appended to the string in an
12063 * existing node, so if there are already other characters in
12064 * the node, close the node with just them, and set up to do
12065 * this character again next time through, when it will be the
12066 * only thing in its new node */
12067 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12073 if (! FOLD /* The simple case, just append the literal */
12074 || (LOC /* Also don't fold for tricky chars under /l */
12075 && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12078 const STRLEN unilen = reguni(pRExC_state, ender, s);
12084 /* The loop increments <len> each time, as all but this
12085 * path (and one other) through it add a single byte to
12086 * the EXACTish node. But this one has changed len to
12087 * be the correct final value, so subtract one to
12088 * cancel out the increment that follows */
12092 REGC((char)ender, s++);
12095 /* Can get here if folding only if is one of the /l
12096 * characters whose fold depends on the locale. The
12097 * occurrence of any of these indicate that we can't
12098 * simplify things */
12100 maybe_exact = FALSE;
12101 maybe_exactfu = FALSE;
12106 /* See comments for join_exact() as to why we fold this
12107 * non-UTF at compile time */
12108 || (node_type == EXACTFU
12109 && ender == LATIN_SMALL_LETTER_SHARP_S)))
12111 /* Here, are folding and are not UTF-8 encoded; therefore
12112 * the character must be in the range 0-255, and is not /l
12113 * (Not /l because we already handled these under /l in
12114 * is_PROBLEMATIC_LOCALE_FOLD_cp */
12115 if (IS_IN_SOME_FOLD_L1(ender)) {
12116 maybe_exact = FALSE;
12118 /* See if the character's fold differs between /d and
12119 * /u. This includes the multi-char fold SHARP S to
12122 && (PL_fold[ender] != PL_fold_latin1[ender]
12123 || ender == LATIN_SMALL_LETTER_SHARP_S
12125 && isARG2_lower_or_UPPER_ARG1('s', ender)
12126 && isARG2_lower_or_UPPER_ARG1('s',
12129 maybe_exactfu = FALSE;
12133 /* Even when folding, we store just the input character, as
12134 * we have an array that finds its fold quickly */
12135 *(s++) = (char) ender;
12137 else { /* FOLD and UTF */
12138 /* Unlike the non-fold case, we do actually have to
12139 * calculate the results here in pass 1. This is for two
12140 * reasons, the folded length may be longer than the
12141 * unfolded, and we have to calculate how many EXACTish
12142 * nodes it will take; and we may run out of room in a node
12143 * in the middle of a potential multi-char fold, and have
12144 * to back off accordingly. (Hence we can't use REGC for
12145 * the simple case just below.) */
12148 if (isASCII(ender)) {
12149 folded = toFOLD(ender);
12150 *(s)++ = (U8) folded;
12155 folded = _to_uni_fold_flags(
12159 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12160 ? FOLD_FLAGS_NOMIX_ASCII
12164 /* The loop increments <len> each time, as all but this
12165 * path (and one other) through it add a single byte to
12166 * the EXACTish node. But this one has changed len to
12167 * be the correct final value, so subtract one to
12168 * cancel out the increment that follows */
12169 len += foldlen - 1;
12171 /* If this node only contains non-folding code points so
12172 * far, see if this new one is also non-folding */
12174 if (folded != ender) {
12175 maybe_exact = FALSE;
12178 /* Here the fold is the original; we have to check
12179 * further to see if anything folds to it */
12180 if (_invlist_contains_cp(PL_utf8_foldable,
12183 maybe_exact = FALSE;
12190 if (next_is_quantifier) {
12192 /* Here, the next input is a quantifier, and to get here,
12193 * the current character is the only one in the node.
12194 * Also, here <len> doesn't include the final byte for this
12200 } /* End of loop through literal characters */
12202 /* Here we have either exhausted the input or ran out of room in
12203 * the node. (If we encountered a character that can't be in the
12204 * node, transfer is made directly to <loopdone>, and so we
12205 * wouldn't have fallen off the end of the loop.) In the latter
12206 * case, we artificially have to split the node into two, because
12207 * we just don't have enough space to hold everything. This
12208 * creates a problem if the final character participates in a
12209 * multi-character fold in the non-final position, as a match that
12210 * should have occurred won't, due to the way nodes are matched,
12211 * and our artificial boundary. So back off until we find a non-
12212 * problematic character -- one that isn't at the beginning or
12213 * middle of such a fold. (Either it doesn't participate in any
12214 * folds, or appears only in the final position of all the folds it
12215 * does participate in.) A better solution with far fewer false
12216 * positives, and that would fill the nodes more completely, would
12217 * be to actually have available all the multi-character folds to
12218 * test against, and to back-off only far enough to be sure that
12219 * this node isn't ending with a partial one. <upper_parse> is set
12220 * further below (if we need to reparse the node) to include just
12221 * up through that final non-problematic character that this code
12222 * identifies, so when it is set to less than the full node, we can
12223 * skip the rest of this */
12224 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12226 const STRLEN full_len = len;
12228 assert(len >= MAX_NODE_STRING_SIZE);
12230 /* Here, <s> points to the final byte of the final character.
12231 * Look backwards through the string until find a non-
12232 * problematic character */
12236 /* This has no multi-char folds to non-UTF characters */
12237 if (ASCII_FOLD_RESTRICTED) {
12241 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12245 if (! PL_NonL1NonFinalFold) {
12246 PL_NonL1NonFinalFold = _new_invlist_C_array(
12247 NonL1_Perl_Non_Final_Folds_invlist);
12250 /* Point to the first byte of the final character */
12251 s = (char *) utf8_hop((U8 *) s, -1);
12253 while (s >= s0) { /* Search backwards until find
12254 non-problematic char */
12255 if (UTF8_IS_INVARIANT(*s)) {
12257 /* There are no ascii characters that participate
12258 * in multi-char folds under /aa. In EBCDIC, the
12259 * non-ascii invariants are all control characters,
12260 * so don't ever participate in any folds. */
12261 if (ASCII_FOLD_RESTRICTED
12262 || ! IS_NON_FINAL_FOLD(*s))
12267 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12268 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12274 else if (! _invlist_contains_cp(
12275 PL_NonL1NonFinalFold,
12276 valid_utf8_to_uvchr((U8 *) s, NULL)))
12281 /* Here, the current character is problematic in that
12282 * it does occur in the non-final position of some
12283 * fold, so try the character before it, but have to
12284 * special case the very first byte in the string, so
12285 * we don't read outside the string */
12286 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12287 } /* End of loop backwards through the string */
12289 /* If there were only problematic characters in the string,
12290 * <s> will point to before s0, in which case the length
12291 * should be 0, otherwise include the length of the
12292 * non-problematic character just found */
12293 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12296 /* Here, have found the final character, if any, that is
12297 * non-problematic as far as ending the node without splitting
12298 * it across a potential multi-char fold. <len> contains the
12299 * number of bytes in the node up-to and including that
12300 * character, or is 0 if there is no such character, meaning
12301 * the whole node contains only problematic characters. In
12302 * this case, give up and just take the node as-is. We can't
12307 /* If the node ends in an 's' we make sure it stays EXACTF,
12308 * as if it turns into an EXACTFU, it could later get
12309 * joined with another 's' that would then wrongly match
12311 if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
12313 maybe_exactfu = FALSE;
12317 /* Here, the node does contain some characters that aren't
12318 * problematic. If one such is the final character in the
12319 * node, we are done */
12320 if (len == full_len) {
12323 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12325 /* If the final character is problematic, but the
12326 * penultimate is not, back-off that last character to
12327 * later start a new node with it */
12332 /* Here, the final non-problematic character is earlier
12333 * in the input than the penultimate character. What we do
12334 * is reparse from the beginning, going up only as far as
12335 * this final ok one, thus guaranteeing that the node ends
12336 * in an acceptable character. The reason we reparse is
12337 * that we know how far in the character is, but we don't
12338 * know how to correlate its position with the input parse.
12339 * An alternate implementation would be to build that
12340 * correlation as we go along during the original parse,
12341 * but that would entail extra work for every node, whereas
12342 * this code gets executed only when the string is too
12343 * large for the node, and the final two characters are
12344 * problematic, an infrequent occurrence. Yet another
12345 * possible strategy would be to save the tail of the
12346 * string, and the next time regatom is called, initialize
12347 * with that. The problem with this is that unless you
12348 * back off one more character, you won't be guaranteed
12349 * regatom will get called again, unless regbranch,
12350 * regpiece ... are also changed. If you do back off that
12351 * extra character, so that there is input guaranteed to
12352 * force calling regatom, you can't handle the case where
12353 * just the first character in the node is acceptable. I
12354 * (khw) decided to try this method which doesn't have that
12355 * pitfall; if performance issues are found, we can do a
12356 * combination of the current approach plus that one */
12362 } /* End of verifying node ends with an appropriate char */
12364 loopdone: /* Jumped to when encounters something that shouldn't be in
12367 /* I (khw) don't know if you can get here with zero length, but the
12368 * old code handled this situation by creating a zero-length EXACT
12369 * node. Might as well be NOTHING instead */
12375 /* If 'maybe_exact' is still set here, means there are no
12376 * code points in the node that participate in folds;
12377 * similarly for 'maybe_exactfu' and code points that match
12378 * differently depending on UTF8ness of the target string
12379 * (for /u), or depending on locale for /l */
12383 else if (maybe_exactfu) {
12387 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12388 FALSE /* Don't look to see if could
12389 be turned into an EXACT
12390 node, as we have already
12395 RExC_parse = p - 1;
12396 Set_Node_Cur_Length(ret, parse_start);
12397 nextchar(pRExC_state);
12399 /* len is STRLEN which is unsigned, need to copy to signed */
12402 vFAIL("Internal disaster");
12405 } /* End of label 'defchar:' */
12407 } /* End of giant switch on input character */
12413 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12415 /* Returns the next non-pattern-white space, non-comment character (the
12416 * latter only if 'recognize_comment is true) in the string p, which is
12417 * ended by RExC_end. See also reg_skipcomment */
12418 const char *e = RExC_end;
12420 PERL_ARGS_ASSERT_REGPATWS;
12424 if ((len = is_PATWS_safe(p, e, UTF))) {
12427 else if (recognize_comment && *p == '#') {
12428 p = reg_skipcomment(pRExC_state, p);
12437 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12439 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12440 * sets up the bitmap and any flags, removing those code points from the
12441 * inversion list, setting it to NULL should it become completely empty */
12443 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12444 assert(PL_regkind[OP(node)] == ANYOF);
12446 ANYOF_BITMAP_ZERO(node);
12447 if (*invlist_ptr) {
12449 /* This gets set if we actually need to modify things */
12450 bool change_invlist = FALSE;
12454 /* Start looking through *invlist_ptr */
12455 invlist_iterinit(*invlist_ptr);
12456 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12460 if (end == UV_MAX && start <= 256) {
12461 ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
12463 else if (end >= 256) {
12464 ANYOF_FLAGS(node) |= ANYOF_UTF8;
12467 /* Quit if are above what we should change */
12472 change_invlist = TRUE;
12474 /* Set all the bits in the range, up to the max that we are doing */
12475 high = (end < 255) ? end : 255;
12476 for (i = start; i <= (int) high; i++) {
12477 if (! ANYOF_BITMAP_TEST(node, i)) {
12478 ANYOF_BITMAP_SET(node, i);
12482 invlist_iterfinish(*invlist_ptr);
12484 /* Done with loop; remove any code points that are in the bitmap from
12485 * *invlist_ptr; similarly for code points above latin1 if we have a
12486 * flag to match all of them anyways */
12487 if (change_invlist) {
12488 _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr);
12490 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
12491 _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr);
12494 /* If have completely emptied it, remove it completely */
12495 if (_invlist_len(*invlist_ptr) == 0) {
12496 SvREFCNT_dec_NN(*invlist_ptr);
12497 *invlist_ptr = NULL;
12502 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12503 Character classes ([:foo:]) can also be negated ([:^foo:]).
12504 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12505 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12506 but trigger failures because they are currently unimplemented. */
12508 #define POSIXCC_DONE(c) ((c) == ':')
12509 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12510 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12512 PERL_STATIC_INLINE I32
12513 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12516 I32 namedclass = OOB_NAMEDCLASS;
12518 PERL_ARGS_ASSERT_REGPPOSIXCC;
12520 if (value == '[' && RExC_parse + 1 < RExC_end &&
12521 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12522 POSIXCC(UCHARAT(RExC_parse)))
12524 const char c = UCHARAT(RExC_parse);
12525 char* const s = RExC_parse++;
12527 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12529 if (RExC_parse == RExC_end) {
12532 /* Try to give a better location for the error (than the end of
12533 * the string) by looking for the matching ']' */
12535 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12538 vFAIL2("Unmatched '%c' in POSIX class", c);
12540 /* Grandfather lone [:, [=, [. */
12544 const char* const t = RExC_parse++; /* skip over the c */
12547 if (UCHARAT(RExC_parse) == ']') {
12548 const char *posixcc = s + 1;
12549 RExC_parse++; /* skip over the ending ] */
12552 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12553 const I32 skip = t - posixcc;
12555 /* Initially switch on the length of the name. */
12558 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12559 this is the Perl \w
12561 namedclass = ANYOF_WORDCHAR;
12564 /* Names all of length 5. */
12565 /* alnum alpha ascii blank cntrl digit graph lower
12566 print punct space upper */
12567 /* Offset 4 gives the best switch position. */
12568 switch (posixcc[4]) {
12570 if (memEQ(posixcc, "alph", 4)) /* alpha */
12571 namedclass = ANYOF_ALPHA;
12574 if (memEQ(posixcc, "spac", 4)) /* space */
12575 namedclass = ANYOF_PSXSPC;
12578 if (memEQ(posixcc, "grap", 4)) /* graph */
12579 namedclass = ANYOF_GRAPH;
12582 if (memEQ(posixcc, "asci", 4)) /* ascii */
12583 namedclass = ANYOF_ASCII;
12586 if (memEQ(posixcc, "blan", 4)) /* blank */
12587 namedclass = ANYOF_BLANK;
12590 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12591 namedclass = ANYOF_CNTRL;
12594 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12595 namedclass = ANYOF_ALPHANUMERIC;
12598 if (memEQ(posixcc, "lowe", 4)) /* lower */
12599 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12600 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12601 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12604 if (memEQ(posixcc, "digi", 4)) /* digit */
12605 namedclass = ANYOF_DIGIT;
12606 else if (memEQ(posixcc, "prin", 4)) /* print */
12607 namedclass = ANYOF_PRINT;
12608 else if (memEQ(posixcc, "punc", 4)) /* punct */
12609 namedclass = ANYOF_PUNCT;
12614 if (memEQ(posixcc, "xdigit", 6))
12615 namedclass = ANYOF_XDIGIT;
12619 if (namedclass == OOB_NAMEDCLASS)
12621 "POSIX class [:%"UTF8f":] unknown",
12622 UTF8fARG(UTF, t - s - 1, s + 1));
12624 /* The #defines are structured so each complement is +1 to
12625 * the normal one */
12629 assert (posixcc[skip] == ':');
12630 assert (posixcc[skip+1] == ']');
12631 } else if (!SIZE_ONLY) {
12632 /* [[=foo=]] and [[.foo.]] are still future. */
12634 /* adjust RExC_parse so the warning shows after
12635 the class closes */
12636 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12638 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12641 /* Maternal grandfather:
12642 * "[:" ending in ":" but not in ":]" */
12644 vFAIL("Unmatched '[' in POSIX class");
12647 /* Grandfather lone [:, [=, [. */
12657 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12659 /* This applies some heuristics at the current parse position (which should
12660 * be at a '[') to see if what follows might be intended to be a [:posix:]
12661 * class. It returns true if it really is a posix class, of course, but it
12662 * also can return true if it thinks that what was intended was a posix
12663 * class that didn't quite make it.
12665 * It will return true for
12667 * [:alphanumerics] (as long as the ] isn't followed immediately by a
12668 * ')' indicating the end of the (?[
12669 * [:any garbage including %^&$ punctuation:]
12671 * This is designed to be called only from S_handle_regex_sets; it could be
12672 * easily adapted to be called from the spot at the beginning of regclass()
12673 * that checks to see in a normal bracketed class if the surrounding []
12674 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
12675 * change long-standing behavior, so I (khw) didn't do that */
12676 char* p = RExC_parse + 1;
12677 char first_char = *p;
12679 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12681 assert(*(p - 1) == '[');
12683 if (! POSIXCC(first_char)) {
12688 while (p < RExC_end && isWORDCHAR(*p)) p++;
12690 if (p >= RExC_end) {
12694 if (p - RExC_parse > 2 /* Got at least 1 word character */
12695 && (*p == first_char
12696 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12701 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12704 && p - RExC_parse > 2 /* [:] evaluates to colon;
12705 [::] is a bad posix class. */
12706 && first_char == *(p - 1));
12710 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12711 I32 *flagp, U32 depth,
12712 char * const oregcomp_parse)
12714 /* Handle the (?[...]) construct to do set operations */
12717 UV start, end; /* End points of code point ranges */
12719 char *save_end, *save_parse;
12724 const bool save_fold = FOLD;
12726 GET_RE_DEBUG_FLAGS_DECL;
12728 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12731 vFAIL("(?[...]) not valid in locale");
12733 RExC_uni_semantics = 1;
12735 /* This will return only an ANYOF regnode, or (unlikely) something smaller
12736 * (such as EXACT). Thus we can skip most everything if just sizing. We
12737 * call regclass to handle '[]' so as to not have to reinvent its parsing
12738 * rules here (throwing away the size it computes each time). And, we exit
12739 * upon an unescaped ']' that isn't one ending a regclass. To do both
12740 * these things, we need to realize that something preceded by a backslash
12741 * is escaped, so we have to keep track of backslashes */
12743 UV depth = 0; /* how many nested (?[...]) constructs */
12745 Perl_ck_warner_d(aTHX_
12746 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12747 "The regex_sets feature is experimental" REPORT_LOCATION,
12748 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12750 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12751 RExC_precomp + (RExC_parse - RExC_precomp)));
12753 while (RExC_parse < RExC_end) {
12754 SV* current = NULL;
12755 RExC_parse = regpatws(pRExC_state, RExC_parse,
12756 TRUE); /* means recognize comments */
12757 switch (*RExC_parse) {
12759 if (RExC_parse[1] == '[') depth++, RExC_parse++;
12764 /* Skip the next byte (which could cause us to end up in
12765 * the middle of a UTF-8 character, but since none of those
12766 * are confusable with anything we currently handle in this
12767 * switch (invariants all), it's safe. We'll just hit the
12768 * default: case next time and keep on incrementing until
12769 * we find one of the invariants we do handle. */
12774 /* If this looks like it is a [:posix:] class, leave the
12775 * parse pointer at the '[' to fool regclass() into
12776 * thinking it is part of a '[[:posix:]]'. That function
12777 * will use strict checking to force a syntax error if it
12778 * doesn't work out to a legitimate class */
12779 bool is_posix_class
12780 = could_it_be_a_POSIX_class(pRExC_state);
12781 if (! is_posix_class) {
12785 /* regclass() can only return RESTART_UTF8 if multi-char
12786 folds are allowed. */
12787 if (!regclass(pRExC_state, flagp,depth+1,
12788 is_posix_class, /* parse the whole char
12789 class only if not a
12791 FALSE, /* don't allow multi-char folds */
12792 TRUE, /* silence non-portable warnings. */
12794 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12797 /* function call leaves parse pointing to the ']', except
12798 * if we faked it */
12799 if (is_posix_class) {
12803 SvREFCNT_dec(current); /* In case it returned something */
12808 if (depth--) break;
12810 if (RExC_parse < RExC_end
12811 && *RExC_parse == ')')
12813 node = reganode(pRExC_state, ANYOF, 0);
12814 RExC_size += ANYOF_SKIP;
12815 nextchar(pRExC_state);
12816 Set_Node_Length(node,
12817 RExC_parse - oregcomp_parse + 1); /* MJD */
12826 FAIL("Syntax error in (?[...])");
12829 /* Pass 2 only after this. Everything in this construct is a
12830 * metacharacter. Operands begin with either a '\' (for an escape
12831 * sequence), or a '[' for a bracketed character class. Any other
12832 * character should be an operator, or parenthesis for grouping. Both
12833 * types of operands are handled by calling regclass() to parse them. It
12834 * is called with a parameter to indicate to return the computed inversion
12835 * list. The parsing here is implemented via a stack. Each entry on the
12836 * stack is a single character representing one of the operators, or the
12837 * '('; or else a pointer to an operand inversion list. */
12839 #define IS_OPERAND(a) (! SvIOK(a))
12841 /* The stack starts empty. It is a syntax error if the first thing parsed
12842 * is a binary operator; everything else is pushed on the stack. When an
12843 * operand is parsed, the top of the stack is examined. If it is a binary
12844 * operator, the item before it should be an operand, and both are replaced
12845 * by the result of doing that operation on the new operand and the one on
12846 * the stack. Thus a sequence of binary operands is reduced to a single
12847 * one before the next one is parsed.
12849 * A unary operator may immediately follow a binary in the input, for
12852 * When an operand is parsed and the top of the stack is a unary operator,
12853 * the operation is performed, and then the stack is rechecked to see if
12854 * this new operand is part of a binary operation; if so, it is handled as
12857 * A '(' is simply pushed on the stack; it is valid only if the stack is
12858 * empty, or the top element of the stack is an operator or another '('
12859 * (for which the parenthesized expression will become an operand). By the
12860 * time the corresponding ')' is parsed everything in between should have
12861 * been parsed and evaluated to a single operand (or else is a syntax
12862 * error), and is handled as a regular operand */
12864 sv_2mortal((SV *)(stack = newAV()));
12866 while (RExC_parse < RExC_end) {
12867 I32 top_index = av_tindex(stack);
12869 SV* current = NULL;
12871 /* Skip white space */
12872 RExC_parse = regpatws(pRExC_state, RExC_parse,
12873 TRUE /* means recognize comments */ );
12874 if (RExC_parse >= RExC_end) {
12875 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12877 if ((curchar = UCHARAT(RExC_parse)) == ']') {
12884 if (av_tindex(stack) >= 0 /* This makes sure that we can
12885 safely subtract 1 from
12886 RExC_parse in the next clause.
12887 If we have something on the
12888 stack, we have parsed something
12890 && UCHARAT(RExC_parse - 1) == '('
12891 && RExC_parse < RExC_end)
12893 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12894 * This happens when we have some thing like
12896 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12898 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
12900 * Here we would be handling the interpolated
12901 * '$thai_or_lao'. We handle this by a recursive call to
12902 * ourselves which returns the inversion list the
12903 * interpolated expression evaluates to. We use the flags
12904 * from the interpolated pattern. */
12905 U32 save_flags = RExC_flags;
12906 const char * const save_parse = ++RExC_parse;
12908 parse_lparen_question_flags(pRExC_state);
12910 if (RExC_parse == save_parse /* Makes sure there was at
12911 least one flag (or this
12912 embedding wasn't compiled)
12914 || RExC_parse >= RExC_end - 4
12915 || UCHARAT(RExC_parse) != ':'
12916 || UCHARAT(++RExC_parse) != '('
12917 || UCHARAT(++RExC_parse) != '?'
12918 || UCHARAT(++RExC_parse) != '[')
12921 /* In combination with the above, this moves the
12922 * pointer to the point just after the first erroneous
12923 * character (or if there are no flags, to where they
12924 * should have been) */
12925 if (RExC_parse >= RExC_end - 4) {
12926 RExC_parse = RExC_end;
12928 else if (RExC_parse != save_parse) {
12929 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12931 vFAIL("Expecting '(?flags:(?[...'");
12934 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
12935 depth+1, oregcomp_parse);
12937 /* Here, 'current' contains the embedded expression's
12938 * inversion list, and RExC_parse points to the trailing
12939 * ']'; the next character should be the ')' which will be
12940 * paired with the '(' that has been put on the stack, so
12941 * the whole embedded expression reduces to '(operand)' */
12944 RExC_flags = save_flags;
12945 goto handle_operand;
12950 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
12951 vFAIL("Unexpected character");
12954 /* regclass() can only return RESTART_UTF8 if multi-char
12955 folds are allowed. */
12956 if (!regclass(pRExC_state, flagp,depth+1,
12957 TRUE, /* means parse just the next thing */
12958 FALSE, /* don't allow multi-char folds */
12959 FALSE, /* don't silence non-portable warnings. */
12961 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12963 /* regclass() will return with parsing just the \ sequence,
12964 * leaving the parse pointer at the next thing to parse */
12966 goto handle_operand;
12968 case '[': /* Is a bracketed character class */
12970 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
12972 if (! is_posix_class) {
12976 /* regclass() can only return RESTART_UTF8 if multi-char
12977 folds are allowed. */
12978 if(!regclass(pRExC_state, flagp,depth+1,
12979 is_posix_class, /* parse the whole char class
12980 only if not a posix class */
12981 FALSE, /* don't allow multi-char folds */
12982 FALSE, /* don't silence non-portable warnings. */
12984 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12986 /* function call leaves parse pointing to the ']', except if we
12988 if (is_posix_class) {
12992 goto handle_operand;
13001 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13002 || ! IS_OPERAND(*top_ptr))
13005 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13007 av_push(stack, newSVuv(curchar));
13011 av_push(stack, newSVuv(curchar));
13015 if (top_index >= 0) {
13016 top_ptr = av_fetch(stack, top_index, FALSE);
13018 if (IS_OPERAND(*top_ptr)) {
13020 vFAIL("Unexpected '(' with no preceding operator");
13023 av_push(stack, newSVuv(curchar));
13030 || ! (current = av_pop(stack))
13031 || ! IS_OPERAND(current)
13032 || ! (lparen = av_pop(stack))
13033 || IS_OPERAND(lparen)
13034 || SvUV(lparen) != '(')
13036 SvREFCNT_dec(current);
13038 vFAIL("Unexpected ')'");
13041 SvREFCNT_dec_NN(lparen);
13048 /* Here, we have an operand to process, in 'current' */
13050 if (top_index < 0) { /* Just push if stack is empty */
13051 av_push(stack, current);
13054 SV* top = av_pop(stack);
13056 char current_operator;
13058 if (IS_OPERAND(top)) {
13059 SvREFCNT_dec_NN(top);
13060 SvREFCNT_dec_NN(current);
13061 vFAIL("Operand with no preceding operator");
13063 current_operator = (char) SvUV(top);
13064 switch (current_operator) {
13065 case '(': /* Push the '(' back on followed by the new
13067 av_push(stack, top);
13068 av_push(stack, current);
13069 SvREFCNT_inc(top); /* Counters the '_dec' done
13070 just after the 'break', so
13071 it doesn't get wrongly freed
13076 _invlist_invert(current);
13078 /* Unlike binary operators, the top of the stack,
13079 * now that this unary one has been popped off, may
13080 * legally be an operator, and we now have operand
13083 SvREFCNT_dec_NN(top);
13084 goto handle_operand;
13087 prev = av_pop(stack);
13088 _invlist_intersection(prev,
13091 av_push(stack, current);
13096 prev = av_pop(stack);
13097 _invlist_union(prev, current, ¤t);
13098 av_push(stack, current);
13102 prev = av_pop(stack);;
13103 _invlist_subtract(prev, current, ¤t);
13104 av_push(stack, current);
13107 case '^': /* The union minus the intersection */
13113 prev = av_pop(stack);
13114 _invlist_union(prev, current, &u);
13115 _invlist_intersection(prev, current, &i);
13116 /* _invlist_subtract will overwrite current
13117 without freeing what it already contains */
13119 _invlist_subtract(u, i, ¤t);
13120 av_push(stack, current);
13121 SvREFCNT_dec_NN(i);
13122 SvREFCNT_dec_NN(u);
13123 SvREFCNT_dec_NN(element);
13128 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13130 SvREFCNT_dec_NN(top);
13131 SvREFCNT_dec(prev);
13135 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13138 if (av_tindex(stack) < 0 /* Was empty */
13139 || ((final = av_pop(stack)) == NULL)
13140 || ! IS_OPERAND(final)
13141 || av_tindex(stack) >= 0) /* More left on stack */
13143 vFAIL("Incomplete expression within '(?[ ])'");
13146 /* Here, 'final' is the resultant inversion list from evaluating the
13147 * expression. Return it if so requested */
13148 if (return_invlist) {
13149 *return_invlist = final;
13153 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13154 * expecting a string of ranges and individual code points */
13155 invlist_iterinit(final);
13156 result_string = newSVpvs("");
13157 while (invlist_iternext(final, &start, &end)) {
13158 if (start == end) {
13159 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13162 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13167 save_parse = RExC_parse;
13168 RExC_parse = SvPV(result_string, len);
13169 save_end = RExC_end;
13170 RExC_end = RExC_parse + len;
13172 /* We turn off folding around the call, as the class we have constructed
13173 * already has all folding taken into consideration, and we don't want
13174 * regclass() to add to that */
13175 RExC_flags &= ~RXf_PMf_FOLD;
13176 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13178 node = regclass(pRExC_state, flagp,depth+1,
13179 FALSE, /* means parse the whole char class */
13180 FALSE, /* don't allow multi-char folds */
13181 TRUE, /* silence non-portable warnings. The above may very
13182 well have generated non-portable code points, but
13183 they're valid on this machine */
13186 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13189 RExC_flags |= RXf_PMf_FOLD;
13191 RExC_parse = save_parse + 1;
13192 RExC_end = save_end;
13193 SvREFCNT_dec_NN(final);
13194 SvREFCNT_dec_NN(result_string);
13196 nextchar(pRExC_state);
13197 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13203 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13205 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13206 * innocent-looking character class, like /[ks]/i won't have to go out to
13207 * disk to find the possible matches.
13209 * This should be called only for a Latin1-range code points, cp, which is
13210 * known to be involved in a fold with other code points above Latin1. It
13211 * would give false results if /aa has been specified. Multi-char folds
13212 * are outside the scope of this, and must be handled specially.
13214 * XXX It would be better to generate these via regen, in case a new
13215 * version of the Unicode standard adds new mappings, though that is not
13216 * really likely, and may be caught by the default: case of the switch
13219 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13225 add_cp_to_invlist(*invlist, KELVIN_SIGN);
13229 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13232 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13233 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13235 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13236 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13237 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13239 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13240 *invlist = add_cp_to_invlist(*invlist,
13241 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13243 case LATIN_SMALL_LETTER_SHARP_S:
13244 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13246 case 'F': case 'f':
13247 case 'I': case 'i':
13248 case 'L': case 'l':
13249 case 'T': case 't':
13250 case 'A': case 'a':
13251 case 'H': case 'h':
13252 case 'J': case 'j':
13253 case 'N': case 'n':
13254 case 'W': case 'w':
13255 case 'Y': case 'y':
13256 /* These all are targets of multi-character folds from code points
13257 * that require UTF8 to express, so they can't match unless the
13258 * target string is in UTF-8, so no action here is necessary, as
13259 * regexec.c properly handles the general case for UTF-8 matching
13260 * and multi-char folds */
13263 /* Use deprecated warning to increase the chances of this being
13265 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13270 /* The names of properties whose definitions are not known at compile time are
13271 * stored in this SV, after a constant heading. So if the length has been
13272 * changed since initialization, then there is a run-time definition. */
13273 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
13274 (SvCUR(listsv) != initial_listsv_len)
13277 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13278 const bool stop_at_1, /* Just parse the next thing, don't
13279 look for a full character class */
13280 bool allow_multi_folds,
13281 const bool silence_non_portable, /* Don't output warnings
13284 SV** ret_invlist) /* Return an inversion list, not a node */
13286 /* parse a bracketed class specification. Most of these will produce an
13287 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13288 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
13289 * under /i with multi-character folds: it will be rewritten following the
13290 * paradigm of this example, where the <multi-fold>s are characters which
13291 * fold to multiple character sequences:
13292 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13293 * gets effectively rewritten as:
13294 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13295 * reg() gets called (recursively) on the rewritten version, and this
13296 * function will return what it constructs. (Actually the <multi-fold>s
13297 * aren't physically removed from the [abcdefghi], it's just that they are
13298 * ignored in the recursion by means of a flag:
13299 * <RExC_in_multi_char_class>.)
13301 * ANYOF nodes contain a bit map for the first 256 characters, with the
13302 * corresponding bit set if that character is in the list. For characters
13303 * above 255, a range list or swash is used. There are extra bits for \w,
13304 * etc. in locale ANYOFs, as what these match is not determinable at
13307 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13308 * to be restarted. This can only happen if ret_invlist is non-NULL.
13312 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13314 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13317 IV namedclass = OOB_NAMEDCLASS;
13318 char *rangebegin = NULL;
13319 bool need_class = 0;
13321 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13322 than just initialized. */
13323 SV* properties = NULL; /* Code points that match \p{} \P{} */
13324 SV* posixes = NULL; /* Code points that match classes like [:word:],
13325 extended beyond the Latin1 range. These have to
13326 be kept separate from other code points for much
13327 of this function because their handling is
13328 different under /i, and for most classes under
13330 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
13331 separate for a while from the non-complemented
13332 versions because of complications with /d
13334 UV element_count = 0; /* Number of distinct elements in the class.
13335 Optimizations may be possible if this is tiny */
13336 AV * multi_char_matches = NULL; /* Code points that fold to more than one
13337 character; used under /i */
13339 char * stop_ptr = RExC_end; /* where to stop parsing */
13340 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13342 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13344 /* Unicode properties are stored in a swash; this holds the current one
13345 * being parsed. If this swash is the only above-latin1 component of the
13346 * character class, an optimization is to pass it directly on to the
13347 * execution engine. Otherwise, it is set to NULL to indicate that there
13348 * are other things in the class that have to be dealt with at execution
13350 SV* swash = NULL; /* Code points that match \p{} \P{} */
13352 /* Set if a component of this character class is user-defined; just passed
13353 * on to the engine */
13354 bool has_user_defined_property = FALSE;
13356 /* inversion list of code points this node matches only when the target
13357 * string is in UTF-8. (Because is under /d) */
13358 SV* depends_list = NULL;
13360 /* Inversion list of code points this node matches regardless of things
13361 * like locale, folding, utf8ness of the target string */
13362 SV* cp_list = NULL;
13364 /* Like cp_list, but code points on this list need to be checked for things
13365 * that fold to/from them under /i */
13366 SV* cp_foldable_list = NULL;
13368 /* Like cp_list, but code points on this list are valid only when the
13369 * runtime locale is UTF-8 */
13370 SV* only_utf8_locale_list = NULL;
13373 /* In a range, counts how many 0-2 of the ends of it came from literals,
13374 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
13375 UV literal_endpoint = 0;
13377 bool invert = FALSE; /* Is this class to be complemented */
13379 bool warn_super = ALWAYS_WARN_SUPER;
13381 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13382 case we need to change the emitted regop to an EXACT. */
13383 const char * orig_parse = RExC_parse;
13384 const SSize_t orig_size = RExC_size;
13385 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13386 GET_RE_DEBUG_FLAGS_DECL;
13388 PERL_ARGS_ASSERT_REGCLASS;
13390 PERL_UNUSED_ARG(depth);
13393 DEBUG_PARSE("clas");
13395 /* Assume we are going to generate an ANYOF node. */
13396 ret = reganode(pRExC_state, ANYOF, 0);
13399 RExC_size += ANYOF_SKIP;
13400 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13403 ANYOF_FLAGS(ret) = 0;
13405 RExC_emit += ANYOF_SKIP;
13406 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13407 initial_listsv_len = SvCUR(listsv);
13408 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
13412 RExC_parse = regpatws(pRExC_state, RExC_parse,
13413 FALSE /* means don't recognize comments */ );
13416 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13419 allow_multi_folds = FALSE;
13422 RExC_parse = regpatws(pRExC_state, RExC_parse,
13423 FALSE /* means don't recognize comments */ );
13427 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13428 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13429 const char *s = RExC_parse;
13430 const char c = *s++;
13432 while (isWORDCHAR(*s))
13434 if (*s && c == *s && s[1] == ']') {
13435 SAVEFREESV(RExC_rx_sv);
13437 "POSIX syntax [%c %c] belongs inside character classes",
13439 (void)ReREFCNT_inc(RExC_rx_sv);
13443 /* If the caller wants us to just parse a single element, accomplish this
13444 * by faking the loop ending condition */
13445 if (stop_at_1 && RExC_end > RExC_parse) {
13446 stop_ptr = RExC_parse + 1;
13449 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13450 if (UCHARAT(RExC_parse) == ']')
13451 goto charclassloop;
13455 if (RExC_parse >= stop_ptr) {
13460 RExC_parse = regpatws(pRExC_state, RExC_parse,
13461 FALSE /* means don't recognize comments */ );
13464 if (UCHARAT(RExC_parse) == ']') {
13470 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13471 save_value = value;
13472 save_prevvalue = prevvalue;
13475 rangebegin = RExC_parse;
13479 value = utf8n_to_uvchr((U8*)RExC_parse,
13480 RExC_end - RExC_parse,
13481 &numlen, UTF8_ALLOW_DEFAULT);
13482 RExC_parse += numlen;
13485 value = UCHARAT(RExC_parse++);
13488 && RExC_parse < RExC_end
13489 && POSIXCC(UCHARAT(RExC_parse)))
13491 namedclass = regpposixcc(pRExC_state, value, strict);
13493 else if (value == '\\') {
13495 value = utf8n_to_uvchr((U8*)RExC_parse,
13496 RExC_end - RExC_parse,
13497 &numlen, UTF8_ALLOW_DEFAULT);
13498 RExC_parse += numlen;
13501 value = UCHARAT(RExC_parse++);
13503 /* Some compilers cannot handle switching on 64-bit integer
13504 * values, therefore value cannot be an UV. Yes, this will
13505 * be a problem later if we want switch on Unicode.
13506 * A similar issue a little bit later when switching on
13507 * namedclass. --jhi */
13509 /* If the \ is escaping white space when white space is being
13510 * skipped, it means that that white space is wanted literally, and
13511 * is already in 'value'. Otherwise, need to translate the escape
13512 * into what it signifies. */
13513 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13515 case 'w': namedclass = ANYOF_WORDCHAR; break;
13516 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13517 case 's': namedclass = ANYOF_SPACE; break;
13518 case 'S': namedclass = ANYOF_NSPACE; break;
13519 case 'd': namedclass = ANYOF_DIGIT; break;
13520 case 'D': namedclass = ANYOF_NDIGIT; break;
13521 case 'v': namedclass = ANYOF_VERTWS; break;
13522 case 'V': namedclass = ANYOF_NVERTWS; break;
13523 case 'h': namedclass = ANYOF_HORIZWS; break;
13524 case 'H': namedclass = ANYOF_NHORIZWS; break;
13525 case 'N': /* Handle \N{NAME} in class */
13527 /* We only pay attention to the first char of
13528 multichar strings being returned. I kinda wonder
13529 if this makes sense as it does change the behaviour
13530 from earlier versions, OTOH that behaviour was broken
13532 if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
13533 TRUE, /* => charclass */
13536 if (*flagp & RESTART_UTF8)
13537 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13547 /* We will handle any undefined properties ourselves */
13548 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13549 /* And we actually would prefer to get
13550 * the straight inversion list of the
13551 * swash, since we will be accessing it
13552 * anyway, to save a little time */
13553 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13555 if (RExC_parse >= RExC_end)
13556 vFAIL2("Empty \\%c{}", (U8)value);
13557 if (*RExC_parse == '{') {
13558 const U8 c = (U8)value;
13559 e = strchr(RExC_parse++, '}');
13561 vFAIL2("Missing right brace on \\%c{}", c);
13562 while (isSPACE(*RExC_parse))
13564 if (e == RExC_parse)
13565 vFAIL2("Empty \\%c{}", c);
13566 n = e - RExC_parse;
13567 while (isSPACE(*(RExC_parse + n - 1)))
13578 if (UCHARAT(RExC_parse) == '^') {
13581 /* toggle. (The rhs xor gets the single bit that
13582 * differs between P and p; the other xor inverts just
13584 value ^= 'P' ^ 'p';
13586 while (isSPACE(*RExC_parse)) {
13591 /* Try to get the definition of the property into
13592 * <invlist>. If /i is in effect, the effective property
13593 * will have its name be <__NAME_i>. The design is
13594 * discussed in commit
13595 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13596 name = savepv(Perl_form(aTHX_
13598 (FOLD) ? "__" : "",
13604 /* Look up the property name, and get its swash and
13605 * inversion list, if the property is found */
13607 SvREFCNT_dec_NN(swash);
13609 swash = _core_swash_init("utf8", name, &PL_sv_undef,
13612 NULL, /* No inversion list */
13615 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13616 HV* curpkg = (IN_PERL_COMPILETIME)
13618 : CopSTASH(PL_curcop);
13620 SvREFCNT_dec_NN(swash);
13624 /* Here didn't find it. It could be a user-defined
13625 * property that will be available at run-time. If we
13626 * accept only compile-time properties, is an error;
13627 * otherwise add it to the list for run-time look up */
13629 RExC_parse = e + 1;
13631 "Property '%"UTF8f"' is unknown",
13632 UTF8fARG(UTF, n, name));
13635 /* If the property name doesn't already have a package
13636 * name, add the current one to it so that it can be
13637 * referred to outside it. [perl #121777] */
13638 if (curpkg && ! instr(name, "::")) {
13639 char* pkgname = HvNAME(curpkg);
13640 if (strNE(pkgname, "main")) {
13641 char* full_name = Perl_form(aTHX_
13645 n = strlen(full_name);
13647 name = savepvn(full_name, n);
13650 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13651 (value == 'p' ? '+' : '!'),
13652 UTF8fARG(UTF, n, name));
13653 has_user_defined_property = TRUE;
13655 /* We don't know yet, so have to assume that the
13656 * property could match something in the Latin1 range,
13657 * hence something that isn't utf8. Note that this
13658 * would cause things in <depends_list> to match
13659 * inappropriately, except that any \p{}, including
13660 * this one forces Unicode semantics, which means there
13661 * is no <depends_list> */
13662 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
13666 /* Here, did get the swash and its inversion list. If
13667 * the swash is from a user-defined property, then this
13668 * whole character class should be regarded as such */
13669 if (swash_init_flags
13670 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13672 has_user_defined_property = TRUE;
13675 /* We warn on matching an above-Unicode code point
13676 * if the match would return true, except don't
13677 * warn for \p{All}, which has exactly one element
13679 (_invlist_contains_cp(invlist, 0x110000)
13680 && (! (_invlist_len(invlist) == 1
13681 && *invlist_array(invlist) == 0)))
13687 /* Invert if asking for the complement */
13688 if (value == 'P') {
13689 _invlist_union_complement_2nd(properties,
13693 /* The swash can't be used as-is, because we've
13694 * inverted things; delay removing it to here after
13695 * have copied its invlist above */
13696 SvREFCNT_dec_NN(swash);
13700 _invlist_union(properties, invlist, &properties);
13705 RExC_parse = e + 1;
13706 namedclass = ANYOF_UNIPROP; /* no official name, but it's
13709 /* \p means they want Unicode semantics */
13710 RExC_uni_semantics = 1;
13713 case 'n': value = '\n'; break;
13714 case 'r': value = '\r'; break;
13715 case 't': value = '\t'; break;
13716 case 'f': value = '\f'; break;
13717 case 'b': value = '\b'; break;
13718 case 'e': value = ASCII_TO_NATIVE('\033');break;
13719 case 'a': value = '\a'; break;
13721 RExC_parse--; /* function expects to be pointed at the 'o' */
13723 const char* error_msg;
13724 bool valid = grok_bslash_o(&RExC_parse,
13727 SIZE_ONLY, /* warnings in pass
13730 silence_non_portable,
13736 if (PL_encoding && value < 0x100) {
13737 goto recode_encoding;
13741 RExC_parse--; /* function expects to be pointed at the 'x' */
13743 const char* error_msg;
13744 bool valid = grok_bslash_x(&RExC_parse,
13747 TRUE, /* Output warnings */
13749 silence_non_portable,
13755 if (PL_encoding && value < 0x100)
13756 goto recode_encoding;
13759 value = grok_bslash_c(*RExC_parse++, SIZE_ONLY);
13761 case '0': case '1': case '2': case '3': case '4':
13762 case '5': case '6': case '7':
13764 /* Take 1-3 octal digits */
13765 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13766 numlen = (strict) ? 4 : 3;
13767 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13768 RExC_parse += numlen;
13771 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13772 vFAIL("Need exactly 3 octal digits");
13774 else if (! SIZE_ONLY /* like \08, \178 */
13776 && RExC_parse < RExC_end
13777 && isDIGIT(*RExC_parse)
13778 && ckWARN(WARN_REGEXP))
13780 SAVEFREESV(RExC_rx_sv);
13781 reg_warn_non_literal_string(
13783 form_short_octal_warning(RExC_parse, numlen));
13784 (void)ReREFCNT_inc(RExC_rx_sv);
13787 if (PL_encoding && value < 0x100)
13788 goto recode_encoding;
13792 if (! RExC_override_recoding) {
13793 SV* enc = PL_encoding;
13794 value = reg_recode((const char)(U8)value, &enc);
13797 vFAIL("Invalid escape in the specified encoding");
13799 else if (SIZE_ONLY) {
13800 ckWARNreg(RExC_parse,
13801 "Invalid escape in the specified encoding");
13807 /* Allow \_ to not give an error */
13808 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13810 vFAIL2("Unrecognized escape \\%c in character class",
13814 SAVEFREESV(RExC_rx_sv);
13815 ckWARN2reg(RExC_parse,
13816 "Unrecognized escape \\%c in character class passed through",
13818 (void)ReREFCNT_inc(RExC_rx_sv);
13822 } /* End of switch on char following backslash */
13823 } /* end of handling backslash escape sequences */
13826 literal_endpoint++;
13829 /* Here, we have the current token in 'value' */
13831 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13834 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
13835 * literal, as is the character that began the false range, i.e.
13836 * the 'a' in the examples */
13839 const int w = (RExC_parse >= rangebegin)
13840 ? RExC_parse - rangebegin
13844 "False [] range \"%"UTF8f"\"",
13845 UTF8fARG(UTF, w, rangebegin));
13848 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13849 ckWARN2reg(RExC_parse,
13850 "False [] range \"%"UTF8f"\"",
13851 UTF8fARG(UTF, w, rangebegin));
13852 (void)ReREFCNT_inc(RExC_rx_sv);
13853 cp_list = add_cp_to_invlist(cp_list, '-');
13854 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
13859 range = 0; /* this was not a true range */
13860 element_count += 2; /* So counts for three values */
13863 classnum = namedclass_to_classnum(namedclass);
13865 if (LOC && namedclass < ANYOF_POSIXL_MAX
13866 #ifndef HAS_ISASCII
13867 && classnum != _CC_ASCII
13870 /* What the Posix classes (like \w, [:space:]) match in locale
13871 * isn't knowable under locale until actual match time. Room
13872 * must be reserved (one time per outer bracketed class) to
13873 * store such classes. The space will contain a bit for each
13874 * named class that is to be matched against. This isn't
13875 * needed for \p{} and pseudo-classes, as they are not affected
13876 * by locale, and hence are dealt with separately */
13877 if (! need_class) {
13880 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13883 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
13885 ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
13886 ANYOF_POSIXL_ZERO(ret);
13889 /* Coverity thinks it is possible for this to be negative; both
13890 * jhi and khw think it's not, but be safer */
13891 assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13892 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
13894 /* See if it already matches the complement of this POSIX
13896 if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
13897 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
13901 posixl_matches_all = TRUE;
13902 break; /* No need to continue. Since it matches both
13903 e.g., \w and \W, it matches everything, and the
13904 bracketed class can be optimized into qr/./s */
13907 /* Add this class to those that should be checked at runtime */
13908 ANYOF_POSIXL_SET(ret, namedclass);
13910 /* The above-Latin1 characters are not subject to locale rules.
13911 * Just add them, in the second pass, to the
13912 * unconditionally-matched list */
13914 SV* scratch_list = NULL;
13916 /* Get the list of the above-Latin1 code points this
13918 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
13919 PL_XPosix_ptrs[classnum],
13921 /* Odd numbers are complements, like
13922 * NDIGIT, NASCII, ... */
13923 namedclass % 2 != 0,
13925 /* Checking if 'cp_list' is NULL first saves an extra
13926 * clone. Its reference count will be decremented at the
13927 * next union, etc, or if this is the only instance, at the
13928 * end of the routine */
13930 cp_list = scratch_list;
13933 _invlist_union(cp_list, scratch_list, &cp_list);
13934 SvREFCNT_dec_NN(scratch_list);
13936 continue; /* Go get next character */
13939 else if (! SIZE_ONLY) {
13941 /* Here, not in pass1 (in that pass we skip calculating the
13942 * contents of this class), and is /l, or is a POSIX class for
13943 * which /l doesn't matter (or is a Unicode property, which is
13944 * skipped here). */
13945 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
13946 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
13948 /* Here, should be \h, \H, \v, or \V. None of /d, /i
13949 * nor /l make a difference in what these match,
13950 * therefore we just add what they match to cp_list. */
13951 if (classnum != _CC_VERTSPACE) {
13952 assert( namedclass == ANYOF_HORIZWS
13953 || namedclass == ANYOF_NHORIZWS);
13955 /* It turns out that \h is just a synonym for
13957 classnum = _CC_BLANK;
13960 _invlist_union_maybe_complement_2nd(
13962 PL_XPosix_ptrs[classnum],
13963 namedclass % 2 != 0, /* Complement if odd
13964 (NHORIZWS, NVERTWS)
13969 else { /* Garden variety class. If is NASCII, NDIGIT, ...
13970 complement and use nposixes */
13971 SV** posixes_ptr = namedclass % 2 == 0
13974 SV** source_ptr = &PL_XPosix_ptrs[classnum];
13975 _invlist_union_maybe_complement_2nd(
13978 namedclass % 2 != 0,
13981 continue; /* Go get next character */
13983 } /* end of namedclass \blah */
13985 /* Here, we have a single value. If 'range' is set, it is the ending
13986 * of a range--check its validity. Later, we will handle each
13987 * individual code point in the range. If 'range' isn't set, this
13988 * could be the beginning of a range, so check for that by looking
13989 * ahead to see if the next real character to be processed is the range
13990 * indicator--the minus sign */
13993 RExC_parse = regpatws(pRExC_state, RExC_parse,
13994 FALSE /* means don't recognize comments */ );
13998 if (prevvalue > value) /* b-a */ {
13999 const int w = RExC_parse - rangebegin;
14001 "Invalid [] range \"%"UTF8f"\"",
14002 UTF8fARG(UTF, w, rangebegin));
14003 range = 0; /* not a valid range */
14007 prevvalue = value; /* save the beginning of the potential range */
14008 if (! stop_at_1 /* Can't be a range if parsing just one thing */
14009 && *RExC_parse == '-')
14011 char* next_char_ptr = RExC_parse + 1;
14012 if (skip_white) { /* Get the next real char after the '-' */
14013 next_char_ptr = regpatws(pRExC_state,
14015 FALSE); /* means don't recognize
14019 /* If the '-' is at the end of the class (just before the ']',
14020 * it is a literal minus; otherwise it is a range */
14021 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14022 RExC_parse = next_char_ptr;
14024 /* a bad range like \w-, [:word:]- ? */
14025 if (namedclass > OOB_NAMEDCLASS) {
14026 if (strict || ckWARN(WARN_REGEXP)) {
14028 RExC_parse >= rangebegin ?
14029 RExC_parse - rangebegin : 0;
14031 vFAIL4("False [] range \"%*.*s\"",
14036 "False [] range \"%*.*s\"",
14041 cp_list = add_cp_to_invlist(cp_list, '-');
14045 range = 1; /* yeah, it's a range! */
14046 continue; /* but do it the next time */
14051 /* Here, <prevvalue> is the beginning of the range, if any; or <value>
14054 /* non-Latin1 code point implies unicode semantics. Must be set in
14055 * pass1 so is there for the whole of pass 2 */
14057 RExC_uni_semantics = 1;
14060 /* Ready to process either the single value, or the completed range.
14061 * For single-valued non-inverted ranges, we consider the possibility
14062 * of multi-char folds. (We made a conscious decision to not do this
14063 * for the other cases because it can often lead to non-intuitive
14064 * results. For example, you have the peculiar case that:
14065 * "s s" =~ /^[^\xDF]+$/i => Y
14066 * "ss" =~ /^[^\xDF]+$/i => N
14068 * See [perl #89750] */
14069 if (FOLD && allow_multi_folds && value == prevvalue) {
14070 if (value == LATIN_SMALL_LETTER_SHARP_S
14071 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14074 /* Here <value> is indeed a multi-char fold. Get what it is */
14076 U8 foldbuf[UTF8_MAXBYTES_CASE];
14079 UV folded = _to_uni_fold_flags(
14083 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14084 ? FOLD_FLAGS_NOMIX_ASCII
14088 /* Here, <folded> should be the first character of the
14089 * multi-char fold of <value>, with <foldbuf> containing the
14090 * whole thing. But, if this fold is not allowed (because of
14091 * the flags), <fold> will be the same as <value>, and should
14092 * be processed like any other character, so skip the special
14094 if (folded != value) {
14096 /* Skip if we are recursed, currently parsing the class
14097 * again. Otherwise add this character to the list of
14098 * multi-char folds. */
14099 if (! RExC_in_multi_char_class) {
14100 AV** this_array_ptr;
14102 STRLEN cp_count = utf8_length(foldbuf,
14103 foldbuf + foldlen);
14104 SV* multi_fold = sv_2mortal(newSVpvs(""));
14106 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14109 if (! multi_char_matches) {
14110 multi_char_matches = newAV();
14113 /* <multi_char_matches> is actually an array of arrays.
14114 * There will be one or two top-level elements: [2],
14115 * and/or [3]. The [2] element is an array, each
14116 * element thereof is a character which folds to TWO
14117 * characters; [3] is for folds to THREE characters.
14118 * (Unicode guarantees a maximum of 3 characters in any
14119 * fold.) When we rewrite the character class below,
14120 * we will do so such that the longest folds are
14121 * written first, so that it prefers the longest
14122 * matching strings first. This is done even if it
14123 * turns out that any quantifier is non-greedy, out of
14124 * programmer laziness. Tom Christiansen has agreed
14125 * that this is ok. This makes the test for the
14126 * ligature 'ffi' come before the test for 'ff' */
14127 if (av_exists(multi_char_matches, cp_count)) {
14128 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14130 this_array = *this_array_ptr;
14133 this_array = newAV();
14134 av_store(multi_char_matches, cp_count,
14137 av_push(this_array, multi_fold);
14140 /* This element should not be processed further in this
14143 value = save_value;
14144 prevvalue = save_prevvalue;
14150 /* Deal with this element of the class */
14153 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14156 SV* this_range = _new_invlist(1);
14157 _append_range_to_invlist(this_range, prevvalue, value);
14159 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14160 * If this range was specified using something like 'i-j', we want
14161 * to include only the 'i' and the 'j', and not anything in
14162 * between, so exclude non-ASCII, non-alphabetics from it.
14163 * However, if the range was specified with something like
14164 * [\x89-\x91] or [\x89-j], all code points within it should be
14165 * included. literal_endpoint==2 means both ends of the range used
14166 * a literal character, not \x{foo} */
14167 if (literal_endpoint == 2
14168 && ((prevvalue >= 'a' && value <= 'z')
14169 || (prevvalue >= 'A' && value <= 'Z')))
14171 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14174 /* Since this above only contains ascii, the intersection of it
14175 * with anything will still yield only ascii */
14176 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14179 _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14180 literal_endpoint = 0;
14184 range = 0; /* this range (if it was one) is done now */
14185 } /* End of loop through all the text within the brackets */
14187 /* If anything in the class expands to more than one character, we have to
14188 * deal with them by building up a substitute parse string, and recursively
14189 * calling reg() on it, instead of proceeding */
14190 if (multi_char_matches) {
14191 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14194 char *save_end = RExC_end;
14195 char *save_parse = RExC_parse;
14196 bool first_time = TRUE; /* First multi-char occurrence doesn't get
14201 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
14202 because too confusing */
14204 sv_catpv(substitute_parse, "(?:");
14208 /* Look at the longest folds first */
14209 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14211 if (av_exists(multi_char_matches, cp_count)) {
14212 AV** this_array_ptr;
14215 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14217 while ((this_sequence = av_pop(*this_array_ptr)) !=
14220 if (! first_time) {
14221 sv_catpv(substitute_parse, "|");
14223 first_time = FALSE;
14225 sv_catpv(substitute_parse, SvPVX(this_sequence));
14230 /* If the character class contains anything else besides these
14231 * multi-character folds, have to include it in recursive parsing */
14232 if (element_count) {
14233 sv_catpv(substitute_parse, "|[");
14234 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14235 sv_catpv(substitute_parse, "]");
14238 sv_catpv(substitute_parse, ")");
14241 /* This is a way to get the parse to skip forward a whole named
14242 * sequence instead of matching the 2nd character when it fails the
14244 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14248 RExC_parse = SvPV(substitute_parse, len);
14249 RExC_end = RExC_parse + len;
14250 RExC_in_multi_char_class = 1;
14251 RExC_emit = (regnode *)orig_emit;
14253 ret = reg(pRExC_state, 1, ®_flags, depth+1);
14255 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14257 RExC_parse = save_parse;
14258 RExC_end = save_end;
14259 RExC_in_multi_char_class = 0;
14260 SvREFCNT_dec_NN(multi_char_matches);
14264 /* Here, we've gone through the entire class and dealt with multi-char
14265 * folds. We are now in a position that we can do some checks to see if we
14266 * can optimize this ANYOF node into a simpler one, even in Pass 1.
14267 * Currently we only do two checks:
14268 * 1) is in the unlikely event that the user has specified both, eg. \w and
14269 * \W under /l, then the class matches everything. (This optimization
14270 * is done only to make the optimizer code run later work.)
14271 * 2) if the character class contains only a single element (including a
14272 * single range), we see if there is an equivalent node for it.
14273 * Other checks are possible */
14274 if (! ret_invlist /* Can't optimize if returning the constructed
14276 && (UNLIKELY(posixl_matches_all) || element_count == 1))
14281 if (UNLIKELY(posixl_matches_all)) {
14284 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14285 \w or [:digit:] or \p{foo}
14288 /* All named classes are mapped into POSIXish nodes, with its FLAG
14289 * argument giving which class it is */
14290 switch ((I32)namedclass) {
14291 case ANYOF_UNIPROP:
14294 /* These don't depend on the charset modifiers. They always
14295 * match under /u rules */
14296 case ANYOF_NHORIZWS:
14297 case ANYOF_HORIZWS:
14298 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14301 case ANYOF_NVERTWS:
14306 /* The actual POSIXish node for all the rest depends on the
14307 * charset modifier. The ones in the first set depend only on
14308 * ASCII or, if available on this platform, locale */
14312 op = (LOC) ? POSIXL : POSIXA;
14323 /* under /a could be alpha */
14325 if (ASCII_RESTRICTED) {
14326 namedclass = ANYOF_ALPHA + (namedclass % 2);
14334 /* The rest have more possibilities depending on the charset.
14335 * We take advantage of the enum ordering of the charset
14336 * modifiers to get the exact node type, */
14338 op = POSIXD + get_regex_charset(RExC_flags);
14339 if (op > POSIXA) { /* /aa is same as /a */
14344 /* The odd numbered ones are the complements of the
14345 * next-lower even number one */
14346 if (namedclass % 2 == 1) {
14350 arg = namedclass_to_classnum(namedclass);
14354 else if (value == prevvalue) {
14356 /* Here, the class consists of just a single code point */
14359 if (! LOC && value == '\n') {
14360 op = REG_ANY; /* Optimize [^\n] */
14361 *flagp |= HASWIDTH|SIMPLE;
14365 else if (value < 256 || UTF) {
14367 /* Optimize a single value into an EXACTish node, but not if it
14368 * would require converting the pattern to UTF-8. */
14369 op = compute_EXACTish(pRExC_state);
14371 } /* Otherwise is a range */
14372 else if (! LOC) { /* locale could vary these */
14373 if (prevvalue == '0') {
14374 if (value == '9') {
14379 else if (prevvalue == 'A') {
14382 && literal_endpoint == 2
14385 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14389 else if (prevvalue == 'a') {
14392 && literal_endpoint == 2
14395 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14401 /* Here, we have changed <op> away from its initial value iff we found
14402 * an optimization */
14405 /* Throw away this ANYOF regnode, and emit the calculated one,
14406 * which should correspond to the beginning, not current, state of
14408 const char * cur_parse = RExC_parse;
14409 RExC_parse = (char *)orig_parse;
14413 /* To get locale nodes to not use the full ANYOF size would
14414 * require moving the code above that writes the portions
14415 * of it that aren't in other nodes to after this point.
14416 * e.g. ANYOF_POSIXL_SET */
14417 RExC_size = orig_size;
14421 RExC_emit = (regnode *)orig_emit;
14422 if (PL_regkind[op] == POSIXD) {
14423 if (op == POSIXL) {
14424 RExC_contains_locale = 1;
14427 op += NPOSIXD - POSIXD;
14432 ret = reg_node(pRExC_state, op);
14434 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14438 *flagp |= HASWIDTH|SIMPLE;
14440 else if (PL_regkind[op] == EXACT) {
14441 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14442 TRUE /* downgradable to EXACT */
14446 RExC_parse = (char *) cur_parse;
14448 SvREFCNT_dec(posixes);
14449 SvREFCNT_dec(nposixes);
14450 SvREFCNT_dec(cp_list);
14451 SvREFCNT_dec(cp_foldable_list);
14458 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14460 /* If folding, we calculate all characters that could fold to or from the
14461 * ones already on the list */
14462 if (cp_foldable_list) {
14464 UV start, end; /* End points of code point ranges */
14466 SV* fold_intersection = NULL;
14469 /* Our calculated list will be for Unicode rules. For locale
14470 * matching, we have to keep a separate list that is consulted at
14471 * runtime only when the locale indicates Unicode rules. For
14472 * non-locale, we just use to the general list */
14474 use_list = &only_utf8_locale_list;
14477 use_list = &cp_list;
14480 /* Only the characters in this class that participate in folds need
14481 * be checked. Get the intersection of this class and all the
14482 * possible characters that are foldable. This can quickly narrow
14483 * down a large class */
14484 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14485 &fold_intersection);
14487 /* The folds for all the Latin1 characters are hard-coded into this
14488 * program, but we have to go out to disk to get the others. */
14489 if (invlist_highest(cp_foldable_list) >= 256) {
14491 /* This is a hash that for a particular fold gives all
14492 * characters that are involved in it */
14493 if (! PL_utf8_foldclosures) {
14494 _load_PL_utf8_foldclosures();
14498 /* Now look at the foldable characters in this class individually */
14499 invlist_iterinit(fold_intersection);
14500 while (invlist_iternext(fold_intersection, &start, &end)) {
14503 /* Look at every character in the range */
14504 for (j = start; j <= end; j++) {
14505 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14511 if (IS_IN_SOME_FOLD_L1(j)) {
14513 /* ASCII is always matched; non-ASCII is matched
14514 * only under Unicode rules (which could happen
14515 * under /l if the locale is a UTF-8 one */
14516 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14517 *use_list = add_cp_to_invlist(*use_list,
14518 PL_fold_latin1[j]);
14522 add_cp_to_invlist(depends_list,
14523 PL_fold_latin1[j]);
14527 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14528 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14530 add_above_Latin1_folds(pRExC_state,
14537 /* Here is an above Latin1 character. We don't have the
14538 * rules hard-coded for it. First, get its fold. This is
14539 * the simple fold, as the multi-character folds have been
14540 * handled earlier and separated out */
14541 _to_uni_fold_flags(j, foldbuf, &foldlen,
14542 (ASCII_FOLD_RESTRICTED)
14543 ? FOLD_FLAGS_NOMIX_ASCII
14546 /* Single character fold of above Latin1. Add everything in
14547 * its fold closure to the list that this node should match.
14548 * The fold closures data structure is a hash with the keys
14549 * being the UTF-8 of every character that is folded to, like
14550 * 'k', and the values each an array of all code points that
14551 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14552 * Multi-character folds are not included */
14553 if ((listp = hv_fetch(PL_utf8_foldclosures,
14554 (char *) foldbuf, foldlen, FALSE)))
14556 AV* list = (AV*) *listp;
14558 for (k = 0; k <= av_tindex(list); k++) {
14559 SV** c_p = av_fetch(list, k, FALSE);
14565 /* /aa doesn't allow folds between ASCII and non- */
14566 if ((ASCII_FOLD_RESTRICTED
14567 && (isASCII(c) != isASCII(j))))
14572 /* Folds under /l which cross the 255/256 boundary
14573 * are added to a separate list. (These are valid
14574 * only when the locale is UTF-8.) */
14575 if (c < 256 && LOC) {
14576 *use_list = add_cp_to_invlist(*use_list, c);
14580 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14582 cp_list = add_cp_to_invlist(cp_list, c);
14585 /* Similarly folds involving non-ascii Latin1
14586 * characters under /d are added to their list */
14587 depends_list = add_cp_to_invlist(depends_list,
14594 SvREFCNT_dec_NN(fold_intersection);
14597 /* Now that we have finished adding all the folds, there is no reason
14598 * to keep the foldable list separate */
14599 _invlist_union(cp_list, cp_foldable_list, &cp_list);
14600 SvREFCNT_dec_NN(cp_foldable_list);
14603 /* And combine the result (if any) with any inversion list from posix
14604 * classes. The lists are kept separate up to now because we don't want to
14605 * fold the classes (folding of those is automatically handled by the swash
14606 * fetching code) */
14607 if (posixes || nposixes) {
14608 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14609 /* Under /a and /aa, nothing above ASCII matches these */
14610 _invlist_intersection(posixes,
14611 PL_XPosix_ptrs[_CC_ASCII],
14615 if (DEPENDS_SEMANTICS) {
14616 /* Under /d, everything in the upper half of the Latin1 range
14617 * matches these complements */
14618 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL;
14620 else if (AT_LEAST_ASCII_RESTRICTED) {
14621 /* Under /a and /aa, everything above ASCII matches these
14623 _invlist_union_complement_2nd(nposixes,
14624 PL_XPosix_ptrs[_CC_ASCII],
14628 _invlist_union(posixes, nposixes, &posixes);
14629 SvREFCNT_dec_NN(nposixes);
14632 posixes = nposixes;
14635 if (! DEPENDS_SEMANTICS) {
14637 _invlist_union(cp_list, posixes, &cp_list);
14638 SvREFCNT_dec_NN(posixes);
14645 /* Under /d, we put into a separate list the Latin1 things that
14646 * match only when the target string is utf8 */
14647 SV* nonascii_but_latin1_properties = NULL;
14648 _invlist_intersection(posixes, PL_UpperLatin1,
14649 &nonascii_but_latin1_properties);
14650 _invlist_subtract(posixes, nonascii_but_latin1_properties,
14653 _invlist_union(cp_list, posixes, &cp_list);
14654 SvREFCNT_dec_NN(posixes);
14660 if (depends_list) {
14661 _invlist_union(depends_list, nonascii_but_latin1_properties,
14663 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14666 depends_list = nonascii_but_latin1_properties;
14671 /* And combine the result (if any) with any inversion list from properties.
14672 * The lists are kept separate up to now so that we can distinguish the two
14673 * in regards to matching above-Unicode. A run-time warning is generated
14674 * if a Unicode property is matched against a non-Unicode code point. But,
14675 * we allow user-defined properties to match anything, without any warning,
14676 * and we also suppress the warning if there is a portion of the character
14677 * class that isn't a Unicode property, and which matches above Unicode, \W
14678 * or [\x{110000}] for example.
14679 * (Note that in this case, unlike the Posix one above, there is no
14680 * <depends_list>, because having a Unicode property forces Unicode
14685 /* If it matters to the final outcome, see if a non-property
14686 * component of the class matches above Unicode. If so, the
14687 * warning gets suppressed. This is true even if just a single
14688 * such code point is specified, as though not strictly correct if
14689 * another such code point is matched against, the fact that they
14690 * are using above-Unicode code points indicates they should know
14691 * the issues involved */
14693 warn_super = ! (invert
14694 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14697 _invlist_union(properties, cp_list, &cp_list);
14698 SvREFCNT_dec_NN(properties);
14701 cp_list = properties;
14705 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14709 /* Here, we have calculated what code points should be in the character
14712 * Now we can see about various optimizations. Fold calculation (which we
14713 * did above) needs to take place before inversion. Otherwise /[^k]/i
14714 * would invert to include K, which under /i would match k, which it
14715 * shouldn't. Therefore we can't invert folded locale now, as it won't be
14716 * folded until runtime */
14718 /* If we didn't do folding, it's because some information isn't available
14719 * until runtime; set the run-time fold flag for these. (We don't have to
14720 * worry about properties folding, as that is taken care of by the swash
14721 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
14722 * locales, or the class matches at least one 0-255 range code point */
14724 if (only_utf8_locale_list) {
14725 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14727 else if (cp_list) { /* Look to see if there a 0-255 code point is in
14730 invlist_iterinit(cp_list);
14731 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14732 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14734 invlist_iterfinish(cp_list);
14738 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14739 * at compile time. Besides not inverting folded locale now, we can't
14740 * invert if there are things such as \w, which aren't known until runtime
14744 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14746 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14748 _invlist_invert(cp_list);
14750 /* Any swash can't be used as-is, because we've inverted things */
14752 SvREFCNT_dec_NN(swash);
14756 /* Clear the invert flag since have just done it here */
14761 *ret_invlist = cp_list;
14762 SvREFCNT_dec(swash);
14764 /* Discard the generated node */
14766 RExC_size = orig_size;
14769 RExC_emit = orig_emit;
14774 /* Some character classes are equivalent to other nodes. Such nodes take
14775 * up less room and generally fewer operations to execute than ANYOF nodes.
14776 * Above, we checked for and optimized into some such equivalents for
14777 * certain common classes that are easy to test. Getting to this point in
14778 * the code means that the class didn't get optimized there. Since this
14779 * code is only executed in Pass 2, it is too late to save space--it has
14780 * been allocated in Pass 1, and currently isn't given back. But turning
14781 * things into an EXACTish node can allow the optimizer to join it to any
14782 * adjacent such nodes. And if the class is equivalent to things like /./,
14783 * expensive run-time swashes can be avoided. Now that we have more
14784 * complete information, we can find things necessarily missed by the
14785 * earlier code. I (khw) am not sure how much to look for here. It would
14786 * be easy, but perhaps too slow, to check any candidates against all the
14787 * node types they could possibly match using _invlistEQ(). */
14792 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14793 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14795 /* We don't optimize if we are supposed to make sure all non-Unicode
14796 * code points raise a warning, as only ANYOF nodes have this check.
14798 && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14801 U8 op = END; /* The optimzation node-type */
14802 const char * cur_parse= RExC_parse;
14804 invlist_iterinit(cp_list);
14805 if (! invlist_iternext(cp_list, &start, &end)) {
14807 /* Here, the list is empty. This happens, for example, when a
14808 * Unicode property is the only thing in the character class, and
14809 * it doesn't match anything. (perluniprops.pod notes such
14812 *flagp |= HASWIDTH|SIMPLE;
14814 else if (start == end) { /* The range is a single code point */
14815 if (! invlist_iternext(cp_list, &start, &end)
14817 /* Don't do this optimization if it would require changing
14818 * the pattern to UTF-8 */
14819 && (start < 256 || UTF))
14821 /* Here, the list contains a single code point. Can optimize
14822 * into an EXACTish node */
14831 /* A locale node under folding with one code point can be
14832 * an EXACTFL, as its fold won't be calculated until
14838 /* Here, we are generally folding, but there is only one
14839 * code point to match. If we have to, we use an EXACT
14840 * node, but it would be better for joining with adjacent
14841 * nodes in the optimization pass if we used the same
14842 * EXACTFish node that any such are likely to be. We can
14843 * do this iff the code point doesn't participate in any
14844 * folds. For example, an EXACTF of a colon is the same as
14845 * an EXACT one, since nothing folds to or from a colon. */
14847 if (IS_IN_SOME_FOLD_L1(value)) {
14852 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14857 /* If we haven't found the node type, above, it means we
14858 * can use the prevailing one */
14860 op = compute_EXACTish(pRExC_state);
14865 else if (start == 0) {
14866 if (end == UV_MAX) {
14868 *flagp |= HASWIDTH|SIMPLE;
14871 else if (end == '\n' - 1
14872 && invlist_iternext(cp_list, &start, &end)
14873 && start == '\n' + 1 && end == UV_MAX)
14876 *flagp |= HASWIDTH|SIMPLE;
14880 invlist_iterfinish(cp_list);
14883 RExC_parse = (char *)orig_parse;
14884 RExC_emit = (regnode *)orig_emit;
14886 ret = reg_node(pRExC_state, op);
14888 RExC_parse = (char *)cur_parse;
14890 if (PL_regkind[op] == EXACT) {
14891 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14892 TRUE /* downgradable to EXACT */
14896 SvREFCNT_dec_NN(cp_list);
14901 /* Here, <cp_list> contains all the code points we can determine at
14902 * compile time that match under all conditions. Go through it, and
14903 * for things that belong in the bitmap, put them there, and delete from
14904 * <cp_list>. While we are at it, see if everything above 255 is in the
14905 * list, and if so, set a flag to speed up execution */
14907 populate_ANYOF_from_invlist(ret, &cp_list);
14910 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
14913 /* Here, the bitmap has been populated with all the Latin1 code points that
14914 * always match. Can now add to the overall list those that match only
14915 * when the target string is UTF-8 (<depends_list>). */
14916 if (depends_list) {
14918 _invlist_union(cp_list, depends_list, &cp_list);
14919 SvREFCNT_dec_NN(depends_list);
14922 cp_list = depends_list;
14924 ANYOF_FLAGS(ret) |= ANYOF_UTF8;
14927 /* If there is a swash and more than one element, we can't use the swash in
14928 * the optimization below. */
14929 if (swash && element_count > 1) {
14930 SvREFCNT_dec_NN(swash);
14934 set_ANYOF_arg(pRExC_state, ret, cp_list,
14935 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14937 only_utf8_locale_list,
14938 swash, has_user_defined_property);
14940 *flagp |= HASWIDTH|SIMPLE;
14942 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
14943 RExC_contains_locale = 1;
14949 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14952 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
14953 regnode* const node,
14955 SV* const runtime_defns,
14956 SV* const only_utf8_locale_list,
14958 const bool has_user_defined_property)
14960 /* Sets the arg field of an ANYOF-type node 'node', using information about
14961 * the node passed-in. If there is nothing outside the node's bitmap, the
14962 * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to
14963 * the count returned by add_data(), having allocated and stored an array,
14964 * av, that that count references, as follows:
14965 * av[0] stores the character class description in its textual form.
14966 * This is used later (regexec.c:Perl_regclass_swash()) to
14967 * initialize the appropriate swash, and is also useful for dumping
14968 * the regnode. This is set to &PL_sv_undef if the textual
14969 * description is not needed at run-time (as happens if the other
14970 * elements completely define the class)
14971 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
14972 * computed from av[0]. But if no further computation need be done,
14973 * the swash is stored here now (and av[0] is &PL_sv_undef).
14974 * av[2] stores the inversion list of code points that match only if the
14975 * current locale is UTF-8
14976 * av[3] stores the cp_list inversion list for use in addition or instead
14977 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
14978 * (Otherwise everything needed is already in av[0] and av[1])
14979 * av[4] is set if any component of the class is from a user-defined
14980 * property; used only if av[3] exists */
14984 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
14986 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
14987 assert(! (ANYOF_FLAGS(node)
14988 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
14989 ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
14992 AV * const av = newAV();
14995 assert(ANYOF_FLAGS(node)
14996 & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
14998 av_store(av, 0, (runtime_defns)
14999 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15002 av_store(av, 1, swash);
15003 SvREFCNT_dec_NN(cp_list);
15006 av_store(av, 1, &PL_sv_undef);
15008 av_store(av, 3, cp_list);
15009 av_store(av, 4, newSVuv(has_user_defined_property));
15013 if (only_utf8_locale_list) {
15014 av_store(av, 2, only_utf8_locale_list);
15017 av_store(av, 2, &PL_sv_undef);
15020 rv = newRV_noinc(MUTABLE_SV(av));
15021 n = add_data(pRExC_state, STR_WITH_LEN("s"));
15022 RExC_rxi->data->data[n] = (void*)rv;
15028 /* reg_skipcomment()
15030 Absorbs an /x style # comment from the input stream,
15031 returning a pointer to the first character beyond the comment, or if the
15032 comment terminates the pattern without anything following it, this returns
15033 one past the final character of the pattern (in other words, RExC_end) and
15034 sets the REG_RUN_ON_COMMENT_SEEN flag.
15036 Note it's the callers responsibility to ensure that we are
15037 actually in /x mode
15041 PERL_STATIC_INLINE char*
15042 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15044 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15048 while (p < RExC_end) {
15049 if (*(++p) == '\n') {
15054 /* we ran off the end of the pattern without ending the comment, so we have
15055 * to add an \n when wrapping */
15056 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15062 Advances the parse position, and optionally absorbs
15063 "whitespace" from the inputstream.
15065 Without /x "whitespace" means (?#...) style comments only,
15066 with /x this means (?#...) and # comments and whitespace proper.
15068 Returns the RExC_parse point from BEFORE the scan occurs.
15070 This is the /x friendly way of saying RExC_parse++.
15074 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15076 char* const retval = RExC_parse++;
15078 PERL_ARGS_ASSERT_NEXTCHAR;
15081 if (RExC_end - RExC_parse >= 3
15082 && *RExC_parse == '('
15083 && RExC_parse[1] == '?'
15084 && RExC_parse[2] == '#')
15086 while (*RExC_parse != ')') {
15087 if (RExC_parse == RExC_end)
15088 FAIL("Sequence (?#... not terminated");
15094 if (RExC_flags & RXf_PMf_EXTENDED) {
15095 char * p = regpatws(pRExC_state, RExC_parse,
15096 TRUE); /* means recognize comments */
15097 if (p != RExC_parse) {
15107 - reg_node - emit a node
15109 STATIC regnode * /* Location. */
15110 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15114 regnode * const ret = RExC_emit;
15115 GET_RE_DEBUG_FLAGS_DECL;
15117 PERL_ARGS_ASSERT_REG_NODE;
15120 SIZE_ALIGN(RExC_size);
15124 if (RExC_emit >= RExC_emit_bound)
15125 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15126 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15128 NODE_ALIGN_FILL(ret);
15130 FILL_ADVANCE_NODE(ptr, op);
15131 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
15132 #ifdef RE_TRACK_PATTERN_OFFSETS
15133 if (RExC_offsets) { /* MJD */
15135 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15136 "reg_node", __LINE__,
15138 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15139 ? "Overwriting end of array!\n" : "OK",
15140 (UV)(RExC_emit - RExC_emit_start),
15141 (UV)(RExC_parse - RExC_start),
15142 (UV)RExC_offsets[0]));
15143 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15151 - reganode - emit a node with an argument
15153 STATIC regnode * /* Location. */
15154 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15158 regnode * const ret = RExC_emit;
15159 GET_RE_DEBUG_FLAGS_DECL;
15161 PERL_ARGS_ASSERT_REGANODE;
15164 SIZE_ALIGN(RExC_size);
15169 assert(2==regarglen[op]+1);
15171 Anything larger than this has to allocate the extra amount.
15172 If we changed this to be:
15174 RExC_size += (1 + regarglen[op]);
15176 then it wouldn't matter. Its not clear what side effect
15177 might come from that so its not done so far.
15182 if (RExC_emit >= RExC_emit_bound)
15183 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15184 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15186 NODE_ALIGN_FILL(ret);
15188 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15189 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
15190 #ifdef RE_TRACK_PATTERN_OFFSETS
15191 if (RExC_offsets) { /* MJD */
15193 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15197 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15198 "Overwriting end of array!\n" : "OK",
15199 (UV)(RExC_emit - RExC_emit_start),
15200 (UV)(RExC_parse - RExC_start),
15201 (UV)RExC_offsets[0]));
15202 Set_Cur_Node_Offset;
15210 - reguni - emit (if appropriate) a Unicode character
15212 PERL_STATIC_INLINE STRLEN
15213 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15217 PERL_ARGS_ASSERT_REGUNI;
15219 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15223 - reginsert - insert an operator in front of already-emitted operand
15225 * Means relocating the operand.
15228 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15234 const int offset = regarglen[(U8)op];
15235 const int size = NODE_STEP_REGNODE + offset;
15236 GET_RE_DEBUG_FLAGS_DECL;
15238 PERL_ARGS_ASSERT_REGINSERT;
15239 PERL_UNUSED_CONTEXT;
15240 PERL_UNUSED_ARG(depth);
15241 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15242 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15251 if (RExC_open_parens) {
15253 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15254 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15255 if ( RExC_open_parens[paren] >= opnd ) {
15256 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15257 RExC_open_parens[paren] += size;
15259 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15261 if ( RExC_close_parens[paren] >= opnd ) {
15262 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15263 RExC_close_parens[paren] += size;
15265 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15270 while (src > opnd) {
15271 StructCopy(--src, --dst, regnode);
15272 #ifdef RE_TRACK_PATTERN_OFFSETS
15273 if (RExC_offsets) { /* MJD 20010112 */
15275 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15279 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15280 ? "Overwriting end of array!\n" : "OK",
15281 (UV)(src - RExC_emit_start),
15282 (UV)(dst - RExC_emit_start),
15283 (UV)RExC_offsets[0]));
15284 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15285 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15291 place = opnd; /* Op node, where operand used to be. */
15292 #ifdef RE_TRACK_PATTERN_OFFSETS
15293 if (RExC_offsets) { /* MJD */
15295 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15299 (UV)(place - RExC_emit_start) > RExC_offsets[0]
15300 ? "Overwriting end of array!\n" : "OK",
15301 (UV)(place - RExC_emit_start),
15302 (UV)(RExC_parse - RExC_start),
15303 (UV)RExC_offsets[0]));
15304 Set_Node_Offset(place, RExC_parse);
15305 Set_Node_Length(place, 1);
15308 src = NEXTOPER(place);
15309 FILL_ADVANCE_NODE(place, op);
15310 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
15311 Zero(src, offset, regnode);
15315 - regtail - set the next-pointer at the end of a node chain of p to val.
15316 - SEE ALSO: regtail_study
15318 /* TODO: All three parms should be const */
15320 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15321 const regnode *val,U32 depth)
15325 GET_RE_DEBUG_FLAGS_DECL;
15327 PERL_ARGS_ASSERT_REGTAIL;
15329 PERL_UNUSED_ARG(depth);
15335 /* Find last node. */
15338 regnode * const temp = regnext(scan);
15340 SV * const mysv=sv_newmortal();
15341 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15342 regprop(RExC_rx, mysv, scan, NULL);
15343 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15344 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15345 (temp == NULL ? "->" : ""),
15346 (temp == NULL ? PL_reg_name[OP(val)] : "")
15354 if (reg_off_by_arg[OP(scan)]) {
15355 ARG_SET(scan, val - scan);
15358 NEXT_OFF(scan) = val - scan;
15364 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15365 - Look for optimizable sequences at the same time.
15366 - currently only looks for EXACT chains.
15368 This is experimental code. The idea is to use this routine to perform
15369 in place optimizations on branches and groups as they are constructed,
15370 with the long term intention of removing optimization from study_chunk so
15371 that it is purely analytical.
15373 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15374 to control which is which.
15377 /* TODO: All four parms should be const */
15380 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15381 const regnode *val,U32 depth)
15386 #ifdef EXPERIMENTAL_INPLACESCAN
15389 GET_RE_DEBUG_FLAGS_DECL;
15391 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15397 /* Find last node. */
15401 regnode * const temp = regnext(scan);
15402 #ifdef EXPERIMENTAL_INPLACESCAN
15403 if (PL_regkind[OP(scan)] == EXACT) {
15404 bool unfolded_multi_char; /* Unexamined in this routine */
15405 if (join_exact(pRExC_state, scan, &min,
15406 &unfolded_multi_char, 1, val, depth+1))
15411 switch (OP(scan)) {
15414 case EXACTFA_NO_TRIE:
15419 if( exact == PSEUDO )
15421 else if ( exact != OP(scan) )
15430 SV * const mysv=sv_newmortal();
15431 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15432 regprop(RExC_rx, mysv, scan, NULL);
15433 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15434 SvPV_nolen_const(mysv),
15435 REG_NODE_NUM(scan),
15436 PL_reg_name[exact]);
15443 SV * const mysv_val=sv_newmortal();
15444 DEBUG_PARSE_MSG("");
15445 regprop(RExC_rx, mysv_val, val, NULL);
15446 PerlIO_printf(Perl_debug_log,
15447 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15448 SvPV_nolen_const(mysv_val),
15449 (IV)REG_NODE_NUM(val),
15453 if (reg_off_by_arg[OP(scan)]) {
15454 ARG_SET(scan, val - scan);
15457 NEXT_OFF(scan) = val - scan;
15465 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15470 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15475 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15477 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15478 if (flags & (1<<bit)) {
15479 if (!set++ && lead)
15480 PerlIO_printf(Perl_debug_log, "%s",lead);
15481 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15486 PerlIO_printf(Perl_debug_log, "\n");
15488 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15493 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15499 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15501 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15502 if (flags & (1<<bit)) {
15503 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15506 if (!set++ && lead)
15507 PerlIO_printf(Perl_debug_log, "%s",lead);
15508 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15511 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15512 if (!set++ && lead) {
15513 PerlIO_printf(Perl_debug_log, "%s",lead);
15516 case REGEX_UNICODE_CHARSET:
15517 PerlIO_printf(Perl_debug_log, "UNICODE");
15519 case REGEX_LOCALE_CHARSET:
15520 PerlIO_printf(Perl_debug_log, "LOCALE");
15522 case REGEX_ASCII_RESTRICTED_CHARSET:
15523 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15525 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15526 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15529 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15535 PerlIO_printf(Perl_debug_log, "\n");
15537 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15543 Perl_regdump(pTHX_ const regexp *r)
15547 SV * const sv = sv_newmortal();
15548 SV *dsv= sv_newmortal();
15549 RXi_GET_DECL(r,ri);
15550 GET_RE_DEBUG_FLAGS_DECL;
15552 PERL_ARGS_ASSERT_REGDUMP;
15554 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15556 /* Header fields of interest. */
15557 if (r->anchored_substr) {
15558 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15559 RE_SV_DUMPLEN(r->anchored_substr), 30);
15560 PerlIO_printf(Perl_debug_log,
15561 "anchored %s%s at %"IVdf" ",
15562 s, RE_SV_TAIL(r->anchored_substr),
15563 (IV)r->anchored_offset);
15564 } else if (r->anchored_utf8) {
15565 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15566 RE_SV_DUMPLEN(r->anchored_utf8), 30);
15567 PerlIO_printf(Perl_debug_log,
15568 "anchored utf8 %s%s at %"IVdf" ",
15569 s, RE_SV_TAIL(r->anchored_utf8),
15570 (IV)r->anchored_offset);
15572 if (r->float_substr) {
15573 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15574 RE_SV_DUMPLEN(r->float_substr), 30);
15575 PerlIO_printf(Perl_debug_log,
15576 "floating %s%s at %"IVdf"..%"UVuf" ",
15577 s, RE_SV_TAIL(r->float_substr),
15578 (IV)r->float_min_offset, (UV)r->float_max_offset);
15579 } else if (r->float_utf8) {
15580 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15581 RE_SV_DUMPLEN(r->float_utf8), 30);
15582 PerlIO_printf(Perl_debug_log,
15583 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15584 s, RE_SV_TAIL(r->float_utf8),
15585 (IV)r->float_min_offset, (UV)r->float_max_offset);
15587 if (r->check_substr || r->check_utf8)
15588 PerlIO_printf(Perl_debug_log,
15590 (r->check_substr == r->float_substr
15591 && r->check_utf8 == r->float_utf8
15592 ? "(checking floating" : "(checking anchored"));
15593 if (r->intflags & PREGf_NOSCAN)
15594 PerlIO_printf(Perl_debug_log, " noscan");
15595 if (r->extflags & RXf_CHECK_ALL)
15596 PerlIO_printf(Perl_debug_log, " isall");
15597 if (r->check_substr || r->check_utf8)
15598 PerlIO_printf(Perl_debug_log, ") ");
15600 if (ri->regstclass) {
15601 regprop(r, sv, ri->regstclass, NULL);
15602 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15604 if (r->intflags & PREGf_ANCH) {
15605 PerlIO_printf(Perl_debug_log, "anchored");
15606 if (r->intflags & PREGf_ANCH_BOL)
15607 PerlIO_printf(Perl_debug_log, "(BOL)");
15608 if (r->intflags & PREGf_ANCH_MBOL)
15609 PerlIO_printf(Perl_debug_log, "(MBOL)");
15610 if (r->intflags & PREGf_ANCH_SBOL)
15611 PerlIO_printf(Perl_debug_log, "(SBOL)");
15612 if (r->intflags & PREGf_ANCH_GPOS)
15613 PerlIO_printf(Perl_debug_log, "(GPOS)");
15614 PerlIO_putc(Perl_debug_log, ' ');
15616 if (r->intflags & PREGf_GPOS_SEEN)
15617 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15618 if (r->intflags & PREGf_SKIP)
15619 PerlIO_printf(Perl_debug_log, "plus ");
15620 if (r->intflags & PREGf_IMPLICIT)
15621 PerlIO_printf(Perl_debug_log, "implicit ");
15622 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15623 if (r->extflags & RXf_EVAL_SEEN)
15624 PerlIO_printf(Perl_debug_log, "with eval ");
15625 PerlIO_printf(Perl_debug_log, "\n");
15627 regdump_extflags("r->extflags: ",r->extflags);
15628 regdump_intflags("r->intflags: ",r->intflags);
15631 PERL_ARGS_ASSERT_REGDUMP;
15632 PERL_UNUSED_CONTEXT;
15633 PERL_UNUSED_ARG(r);
15634 #endif /* DEBUGGING */
15638 - regprop - printable representation of opcode, with run time support
15642 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15648 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15649 static const char * const anyofs[] = {
15650 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15651 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
15652 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
15653 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
15654 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
15655 || _CC_VERTSPACE != 16
15656 #error Need to adjust order of anyofs[]
15693 RXi_GET_DECL(prog,progi);
15694 GET_RE_DEBUG_FLAGS_DECL;
15696 PERL_ARGS_ASSERT_REGPROP;
15700 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
15701 /* It would be nice to FAIL() here, but this may be called from
15702 regexec.c, and it would be hard to supply pRExC_state. */
15703 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15704 (int)OP(o), (int)REGNODE_MAX);
15705 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15707 k = PL_regkind[OP(o)];
15710 sv_catpvs(sv, " ");
15711 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15712 * is a crude hack but it may be the best for now since
15713 * we have no flag "this EXACTish node was UTF-8"
15715 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15716 PERL_PV_ESCAPE_UNI_DETECT |
15717 PERL_PV_ESCAPE_NONASCII |
15718 PERL_PV_PRETTY_ELLIPSES |
15719 PERL_PV_PRETTY_LTGT |
15720 PERL_PV_PRETTY_NOCLEAR
15722 } else if (k == TRIE) {
15723 /* print the details of the trie in dumpuntil instead, as
15724 * progi->data isn't available here */
15725 const char op = OP(o);
15726 const U32 n = ARG(o);
15727 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15728 (reg_ac_data *)progi->data->data[n] :
15730 const reg_trie_data * const trie
15731 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15733 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15734 DEBUG_TRIE_COMPILE_r(
15735 Perl_sv_catpvf(aTHX_ sv,
15736 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15737 (UV)trie->startstate,
15738 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15739 (UV)trie->wordcount,
15742 (UV)TRIE_CHARCOUNT(trie),
15743 (UV)trie->uniquecharcount
15746 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
15747 sv_catpvs(sv, "[");
15748 (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
15750 : TRIE_BITMAP(trie));
15751 sv_catpvs(sv, "]");
15754 } else if (k == CURLY) {
15755 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
15756 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
15757 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
15759 else if (k == WHILEM && o->flags) /* Ordinal/of */
15760 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
15761 else if (k == REF || k == OPEN || k == CLOSE
15762 || k == GROUPP || OP(o)==ACCEPT)
15764 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
15765 if ( RXp_PAREN_NAMES(prog) ) {
15766 if ( k != REF || (OP(o) < NREF)) {
15767 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
15768 SV **name= av_fetch(list, ARG(o), 0 );
15770 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15773 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
15774 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
15775 I32 *nums=(I32*)SvPVX(sv_dat);
15776 SV **name= av_fetch(list, nums[0], 0 );
15779 for ( n=0; n<SvIVX(sv_dat); n++ ) {
15780 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
15781 (n ? "," : ""), (IV)nums[n]);
15783 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
15787 if ( k == REF && reginfo) {
15788 U32 n = ARG(o); /* which paren pair */
15789 I32 ln = prog->offs[n].start;
15790 if (prog->lastparen < n || ln == -1)
15791 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
15792 else if (ln == prog->offs[n].end)
15793 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
15795 const char *s = reginfo->strbeg + ln;
15796 Perl_sv_catpvf(aTHX_ sv, ": ");
15797 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
15798 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
15801 } else if (k == GOSUB)
15802 /* Paren and offset */
15803 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
15804 else if (k == VERB) {
15806 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
15807 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
15808 } else if (k == LOGICAL)
15809 /* 2: embedded, otherwise 1 */
15810 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
15811 else if (k == ANYOF) {
15812 const U8 flags = ANYOF_FLAGS(o);
15816 if (flags & ANYOF_LOCALE_FLAGS)
15817 sv_catpvs(sv, "{loc}");
15818 if (flags & ANYOF_LOC_FOLD)
15819 sv_catpvs(sv, "{i}");
15820 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
15821 if (flags & ANYOF_INVERT)
15822 sv_catpvs(sv, "^");
15824 /* output what the standard cp 0-255 bitmap matches */
15825 do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
15827 /* output any special charclass tests (used entirely under use
15829 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
15831 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
15832 if (ANYOF_POSIXL_TEST(o,i)) {
15833 sv_catpv(sv, anyofs[i]);
15839 if ((flags & (ANYOF_ABOVE_LATIN1_ALL
15841 |ANYOF_NONBITMAP_NON_UTF8
15845 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
15846 if (flags & ANYOF_INVERT)
15847 /*make sure the invert info is in each */
15848 sv_catpvs(sv, "^");
15851 if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) {
15852 sv_catpvs(sv, "{non-utf8-latin1-all}");
15855 /* output information about the unicode matching */
15856 if (flags & ANYOF_ABOVE_LATIN1_ALL)
15857 sv_catpvs(sv, "{unicode_all}");
15858 else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
15859 SV *lv; /* Set if there is something outside the bit map. */
15860 bool byte_output = FALSE; /* If something in the bitmap has
15862 SV *only_utf8_locale;
15864 /* Get the stuff that wasn't in the bitmap */
15865 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
15866 &lv, &only_utf8_locale);
15867 if (lv && lv != &PL_sv_undef) {
15868 char *s = savesvpv(lv);
15869 char * const origs = s;
15871 while (*s && *s != '\n')
15875 const char * const t = ++s;
15877 if (flags & ANYOF_NONBITMAP_NON_UTF8) {
15878 sv_catpvs(sv, "{outside bitmap}");
15881 sv_catpvs(sv, "{utf8}");
15885 sv_catpvs(sv, " ");
15891 /* Truncate very long output */
15892 if (s - origs > 256) {
15893 Perl_sv_catpvf(aTHX_ sv,
15895 (int) (s - origs - 1),
15901 else if (*s == '\t') {
15915 SvREFCNT_dec_NN(lv);
15918 if ((flags & ANYOF_LOC_FOLD)
15919 && only_utf8_locale
15920 && only_utf8_locale != &PL_sv_undef)
15923 int max_entries = 256;
15925 sv_catpvs(sv, "{utf8 locale}");
15926 invlist_iterinit(only_utf8_locale);
15927 while (invlist_iternext(only_utf8_locale,
15929 put_range(sv, start, end);
15931 if (max_entries < 0) {
15932 sv_catpvs(sv, "...");
15936 invlist_iterfinish(only_utf8_locale);
15941 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
15943 else if (k == POSIXD || k == NPOSIXD) {
15944 U8 index = FLAGS(o) * 2;
15945 if (index < C_ARRAY_LENGTH(anyofs)) {
15946 if (*anyofs[index] != '[') {
15949 sv_catpv(sv, anyofs[index]);
15950 if (*anyofs[index] != '[') {
15955 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
15958 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
15959 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
15961 PERL_UNUSED_CONTEXT;
15962 PERL_UNUSED_ARG(sv);
15963 PERL_UNUSED_ARG(o);
15964 PERL_UNUSED_ARG(prog);
15965 PERL_UNUSED_ARG(reginfo);
15966 #endif /* DEBUGGING */
15972 Perl_re_intuit_string(pTHX_ REGEXP * const r)
15973 { /* Assume that RE_INTUIT is set */
15975 struct regexp *const prog = ReANY(r);
15976 GET_RE_DEBUG_FLAGS_DECL;
15978 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
15979 PERL_UNUSED_CONTEXT;
15983 const char * const s = SvPV_nolen_const(prog->check_substr
15984 ? prog->check_substr : prog->check_utf8);
15986 if (!PL_colorset) reginitcolors();
15987 PerlIO_printf(Perl_debug_log,
15988 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
15990 prog->check_substr ? "" : "utf8 ",
15991 PL_colors[5],PL_colors[0],
15994 (strlen(s) > 60 ? "..." : ""));
15997 return prog->check_substr ? prog->check_substr : prog->check_utf8;
16003 handles refcounting and freeing the perl core regexp structure. When
16004 it is necessary to actually free the structure the first thing it
16005 does is call the 'free' method of the regexp_engine associated to
16006 the regexp, allowing the handling of the void *pprivate; member
16007 first. (This routine is not overridable by extensions, which is why
16008 the extensions free is called first.)
16010 See regdupe and regdupe_internal if you change anything here.
16012 #ifndef PERL_IN_XSUB_RE
16014 Perl_pregfree(pTHX_ REGEXP *r)
16020 Perl_pregfree2(pTHX_ REGEXP *rx)
16023 struct regexp *const r = ReANY(rx);
16024 GET_RE_DEBUG_FLAGS_DECL;
16026 PERL_ARGS_ASSERT_PREGFREE2;
16028 if (r->mother_re) {
16029 ReREFCNT_dec(r->mother_re);
16031 CALLREGFREE_PVT(rx); /* free the private data */
16032 SvREFCNT_dec(RXp_PAREN_NAMES(r));
16033 Safefree(r->xpv_len_u.xpvlenu_pv);
16036 SvREFCNT_dec(r->anchored_substr);
16037 SvREFCNT_dec(r->anchored_utf8);
16038 SvREFCNT_dec(r->float_substr);
16039 SvREFCNT_dec(r->float_utf8);
16040 Safefree(r->substrs);
16042 RX_MATCH_COPY_FREE(rx);
16043 #ifdef PERL_ANY_COW
16044 SvREFCNT_dec(r->saved_copy);
16047 SvREFCNT_dec(r->qr_anoncv);
16048 rx->sv_u.svu_rx = 0;
16053 This is a hacky workaround to the structural issue of match results
16054 being stored in the regexp structure which is in turn stored in
16055 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16056 could be PL_curpm in multiple contexts, and could require multiple
16057 result sets being associated with the pattern simultaneously, such
16058 as when doing a recursive match with (??{$qr})
16060 The solution is to make a lightweight copy of the regexp structure
16061 when a qr// is returned from the code executed by (??{$qr}) this
16062 lightweight copy doesn't actually own any of its data except for
16063 the starp/end and the actual regexp structure itself.
16069 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16071 struct regexp *ret;
16072 struct regexp *const r = ReANY(rx);
16073 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16075 PERL_ARGS_ASSERT_REG_TEMP_COPY;
16078 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16080 SvOK_off((SV *)ret_x);
16082 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16083 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
16084 made both spots point to the same regexp body.) */
16085 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16086 assert(!SvPVX(ret_x));
16087 ret_x->sv_u.svu_rx = temp->sv_any;
16088 temp->sv_any = NULL;
16089 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16090 SvREFCNT_dec_NN(temp);
16091 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16092 ing below will not set it. */
16093 SvCUR_set(ret_x, SvCUR(rx));
16096 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16097 sv_force_normal(sv) is called. */
16099 ret = ReANY(ret_x);
16101 SvFLAGS(ret_x) |= SvUTF8(rx);
16102 /* We share the same string buffer as the original regexp, on which we
16103 hold a reference count, incremented when mother_re is set below.
16104 The string pointer is copied here, being part of the regexp struct.
16106 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16107 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16109 const I32 npar = r->nparens+1;
16110 Newx(ret->offs, npar, regexp_paren_pair);
16111 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16114 Newx(ret->substrs, 1, struct reg_substr_data);
16115 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16117 SvREFCNT_inc_void(ret->anchored_substr);
16118 SvREFCNT_inc_void(ret->anchored_utf8);
16119 SvREFCNT_inc_void(ret->float_substr);
16120 SvREFCNT_inc_void(ret->float_utf8);
16122 /* check_substr and check_utf8, if non-NULL, point to either their
16123 anchored or float namesakes, and don't hold a second reference. */
16125 RX_MATCH_COPIED_off(ret_x);
16126 #ifdef PERL_ANY_COW
16127 ret->saved_copy = NULL;
16129 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16130 SvREFCNT_inc_void(ret->qr_anoncv);
16136 /* regfree_internal()
16138 Free the private data in a regexp. This is overloadable by
16139 extensions. Perl takes care of the regexp structure in pregfree(),
16140 this covers the *pprivate pointer which technically perl doesn't
16141 know about, however of course we have to handle the
16142 regexp_internal structure when no extension is in use.
16144 Note this is called before freeing anything in the regexp
16149 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16152 struct regexp *const r = ReANY(rx);
16153 RXi_GET_DECL(r,ri);
16154 GET_RE_DEBUG_FLAGS_DECL;
16156 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16162 SV *dsv= sv_newmortal();
16163 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16164 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16165 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16166 PL_colors[4],PL_colors[5],s);
16169 #ifdef RE_TRACK_PATTERN_OFFSETS
16171 Safefree(ri->u.offsets); /* 20010421 MJD */
16173 if (ri->code_blocks) {
16175 for (n = 0; n < ri->num_code_blocks; n++)
16176 SvREFCNT_dec(ri->code_blocks[n].src_regex);
16177 Safefree(ri->code_blocks);
16181 int n = ri->data->count;
16184 /* If you add a ->what type here, update the comment in regcomp.h */
16185 switch (ri->data->what[n]) {
16191 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16194 Safefree(ri->data->data[n]);
16200 { /* Aho Corasick add-on structure for a trie node.
16201 Used in stclass optimization only */
16203 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16205 refcount = --aho->refcount;
16208 PerlMemShared_free(aho->states);
16209 PerlMemShared_free(aho->fail);
16210 /* do this last!!!! */
16211 PerlMemShared_free(ri->data->data[n]);
16212 /* we should only ever get called once, so
16213 * assert as much, and also guard the free
16214 * which /might/ happen twice. At the least
16215 * it will make code anlyzers happy and it
16216 * doesn't cost much. - Yves */
16217 assert(ri->regstclass);
16218 if (ri->regstclass) {
16219 PerlMemShared_free(ri->regstclass);
16220 ri->regstclass = 0;
16227 /* trie structure. */
16229 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16231 refcount = --trie->refcount;
16234 PerlMemShared_free(trie->charmap);
16235 PerlMemShared_free(trie->states);
16236 PerlMemShared_free(trie->trans);
16238 PerlMemShared_free(trie->bitmap);
16240 PerlMemShared_free(trie->jump);
16241 PerlMemShared_free(trie->wordinfo);
16242 /* do this last!!!! */
16243 PerlMemShared_free(ri->data->data[n]);
16248 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16249 ri->data->what[n]);
16252 Safefree(ri->data->what);
16253 Safefree(ri->data);
16259 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16260 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16261 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16264 re_dup - duplicate a regexp.
16266 This routine is expected to clone a given regexp structure. It is only
16267 compiled under USE_ITHREADS.
16269 After all of the core data stored in struct regexp is duplicated
16270 the regexp_engine.dupe method is used to copy any private data
16271 stored in the *pprivate pointer. This allows extensions to handle
16272 any duplication it needs to do.
16274 See pregfree() and regfree_internal() if you change anything here.
16276 #if defined(USE_ITHREADS)
16277 #ifndef PERL_IN_XSUB_RE
16279 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16283 const struct regexp *r = ReANY(sstr);
16284 struct regexp *ret = ReANY(dstr);
16286 PERL_ARGS_ASSERT_RE_DUP_GUTS;
16288 npar = r->nparens+1;
16289 Newx(ret->offs, npar, regexp_paren_pair);
16290 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16292 if (ret->substrs) {
16293 /* Do it this way to avoid reading from *r after the StructCopy().
16294 That way, if any of the sv_dup_inc()s dislodge *r from the L1
16295 cache, it doesn't matter. */
16296 const bool anchored = r->check_substr
16297 ? r->check_substr == r->anchored_substr
16298 : r->check_utf8 == r->anchored_utf8;
16299 Newx(ret->substrs, 1, struct reg_substr_data);
16300 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16302 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16303 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16304 ret->float_substr = sv_dup_inc(ret->float_substr, param);
16305 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16307 /* check_substr and check_utf8, if non-NULL, point to either their
16308 anchored or float namesakes, and don't hold a second reference. */
16310 if (ret->check_substr) {
16312 assert(r->check_utf8 == r->anchored_utf8);
16313 ret->check_substr = ret->anchored_substr;
16314 ret->check_utf8 = ret->anchored_utf8;
16316 assert(r->check_substr == r->float_substr);
16317 assert(r->check_utf8 == r->float_utf8);
16318 ret->check_substr = ret->float_substr;
16319 ret->check_utf8 = ret->float_utf8;
16321 } else if (ret->check_utf8) {
16323 ret->check_utf8 = ret->anchored_utf8;
16325 ret->check_utf8 = ret->float_utf8;
16330 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16331 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16334 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16336 if (RX_MATCH_COPIED(dstr))
16337 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
16339 ret->subbeg = NULL;
16340 #ifdef PERL_ANY_COW
16341 ret->saved_copy = NULL;
16344 /* Whether mother_re be set or no, we need to copy the string. We
16345 cannot refrain from copying it when the storage points directly to
16346 our mother regexp, because that's
16347 1: a buffer in a different thread
16348 2: something we no longer hold a reference on
16349 so we need to copy it locally. */
16350 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16351 ret->mother_re = NULL;
16353 #endif /* PERL_IN_XSUB_RE */
16358 This is the internal complement to regdupe() which is used to copy
16359 the structure pointed to by the *pprivate pointer in the regexp.
16360 This is the core version of the extension overridable cloning hook.
16361 The regexp structure being duplicated will be copied by perl prior
16362 to this and will be provided as the regexp *r argument, however
16363 with the /old/ structures pprivate pointer value. Thus this routine
16364 may override any copying normally done by perl.
16366 It returns a pointer to the new regexp_internal structure.
16370 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16373 struct regexp *const r = ReANY(rx);
16374 regexp_internal *reti;
16376 RXi_GET_DECL(r,ri);
16378 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16382 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16383 char, regexp_internal);
16384 Copy(ri->program, reti->program, len+1, regnode);
16386 reti->num_code_blocks = ri->num_code_blocks;
16387 if (ri->code_blocks) {
16389 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16390 struct reg_code_block);
16391 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16392 struct reg_code_block);
16393 for (n = 0; n < ri->num_code_blocks; n++)
16394 reti->code_blocks[n].src_regex = (REGEXP*)
16395 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16398 reti->code_blocks = NULL;
16400 reti->regstclass = NULL;
16403 struct reg_data *d;
16404 const int count = ri->data->count;
16407 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16408 char, struct reg_data);
16409 Newx(d->what, count, U8);
16412 for (i = 0; i < count; i++) {
16413 d->what[i] = ri->data->what[i];
16414 switch (d->what[i]) {
16415 /* see also regcomp.h and regfree_internal() */
16416 case 'a': /* actually an AV, but the dup function is identical. */
16420 case 'u': /* actually an HV, but the dup function is identical. */
16421 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16424 /* This is cheating. */
16425 Newx(d->data[i], 1, regnode_ssc);
16426 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16427 reti->regstclass = (regnode*)d->data[i];
16430 /* Trie stclasses are readonly and can thus be shared
16431 * without duplication. We free the stclass in pregfree
16432 * when the corresponding reg_ac_data struct is freed.
16434 reti->regstclass= ri->regstclass;
16438 ((reg_trie_data*)ri->data->data[i])->refcount++;
16443 d->data[i] = ri->data->data[i];
16446 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16447 ri->data->what[i]);
16456 reti->name_list_idx = ri->name_list_idx;
16458 #ifdef RE_TRACK_PATTERN_OFFSETS
16459 if (ri->u.offsets) {
16460 Newx(reti->u.offsets, 2*len+1, U32);
16461 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16464 SetProgLen(reti,len);
16467 return (void*)reti;
16470 #endif /* USE_ITHREADS */
16472 #ifndef PERL_IN_XSUB_RE
16475 - regnext - dig the "next" pointer out of a node
16478 Perl_regnext(pTHX_ regnode *p)
16486 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
16487 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16488 (int)OP(p), (int)REGNODE_MAX);
16491 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16500 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16503 STRLEN l1 = strlen(pat1);
16504 STRLEN l2 = strlen(pat2);
16507 const char *message;
16509 PERL_ARGS_ASSERT_RE_CROAK2;
16515 Copy(pat1, buf, l1 , char);
16516 Copy(pat2, buf + l1, l2 , char);
16517 buf[l1 + l2] = '\n';
16518 buf[l1 + l2 + 1] = '\0';
16519 va_start(args, pat2);
16520 msv = vmess(buf, &args);
16522 message = SvPV_const(msv,l1);
16525 Copy(message, buf, l1 , char);
16526 /* l1-1 to avoid \n */
16527 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16530 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
16532 #ifndef PERL_IN_XSUB_RE
16534 Perl_save_re_context(pTHX)
16538 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
16540 const REGEXP * const rx = PM_GETRE(PL_curpm);
16543 for (i = 1; i <= RX_NPARENS(rx); i++) {
16544 char digits[TYPE_CHARS(long)];
16545 const STRLEN len = my_snprintf(digits, sizeof(digits),
16547 GV *const *const gvp
16548 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
16551 GV * const gv = *gvp;
16552 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
16564 S_put_byte(pTHX_ SV *sv, int c)
16566 PERL_ARGS_ASSERT_PUT_BYTE;
16570 case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
16571 case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
16572 case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
16573 case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
16574 case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
16577 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
16582 const char string = c;
16583 if (c == '-' || c == ']' || c == '\\' || c == '^')
16584 sv_catpvs(sv, "\\");
16585 sv_catpvn(sv, &string, 1);
16590 S_put_range(pTHX_ SV *sv, UV start, UV end)
16593 /* Appends to 'sv' a displayable version of the range of code points from
16594 * 'start' to 'end' */
16596 assert(start <= end);
16598 PERL_ARGS_ASSERT_PUT_RANGE;
16600 if (end - start < 3) { /* Individual chars in short ranges */
16601 for (; start <= end; start++)
16602 put_byte(sv, start);
16604 else if ( end > 255
16605 || ! isALPHANUMERIC(start)
16606 || ! isALPHANUMERIC(end)
16607 || isDIGIT(start) != isDIGIT(end)
16608 || isUPPER(start) != isUPPER(end)
16609 || isLOWER(start) != isLOWER(end)
16611 /* This final test should get optimized out except on EBCDIC
16612 * platforms, where it causes ranges that cross discontinuities
16613 * like i/j to be shown as hex instead of the misleading,
16614 * e.g. H-K (since that range includes more than H, I, J, K).
16616 || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
16618 Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
16620 (end < 256) ? end : 255);
16622 else { /* Here, the ends of the range are both digits, or both uppercase,
16623 or both lowercase; and there's no discontinuity in the range
16624 (which could happen on EBCDIC platforms) */
16625 put_byte(sv, start);
16626 sv_catpvs(sv, "-");
16632 S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
16634 /* Appends to 'sv' a displayable version of the innards of the bracketed
16635 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
16636 * output anything */
16639 bool has_output_anything = FALSE;
16641 PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
16643 for (i = 0; i < 256; i++) {
16644 if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
16646 /* The character at index i should be output. Find the next
16647 * character that should NOT be output */
16649 for (j = i + 1; j <= 256; j++) {
16650 if (! BITMAP_TEST((U8 *) bitmap, j)) {
16655 /* Everything between them is a single range that should be output
16657 put_range(sv, i, j - 1);
16658 has_output_anything = TRUE;
16663 return has_output_anything;
16666 #define CLEAR_OPTSTART \
16667 if (optstart) STMT_START { \
16668 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
16669 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
16673 #define DUMPUNTIL(b,e) \
16675 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
16677 STATIC const regnode *
16678 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
16679 const regnode *last, const regnode *plast,
16680 SV* sv, I32 indent, U32 depth)
16683 U8 op = PSEUDO; /* Arbitrary non-END op. */
16684 const regnode *next;
16685 const regnode *optstart= NULL;
16687 RXi_GET_DECL(r,ri);
16688 GET_RE_DEBUG_FLAGS_DECL;
16690 PERL_ARGS_ASSERT_DUMPUNTIL;
16692 #ifdef DEBUG_DUMPUNTIL
16693 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
16694 last ? last-start : 0,plast ? plast-start : 0);
16697 if (plast && plast < last)
16700 while (PL_regkind[op] != END && (!last || node < last)) {
16702 /* While that wasn't END last time... */
16705 if (op == CLOSE || op == WHILEM)
16707 next = regnext((regnode *)node);
16710 if (OP(node) == OPTIMIZED) {
16711 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
16718 regprop(r, sv, node, NULL);
16719 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
16720 (int)(2*indent + 1), "", SvPVX_const(sv));
16722 if (OP(node) != OPTIMIZED) {
16723 if (next == NULL) /* Next ptr. */
16724 PerlIO_printf(Perl_debug_log, " (0)");
16725 else if (PL_regkind[(U8)op] == BRANCH
16726 && PL_regkind[OP(next)] != BRANCH )
16727 PerlIO_printf(Perl_debug_log, " (FAIL)");
16729 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
16730 (void)PerlIO_putc(Perl_debug_log, '\n');
16734 if (PL_regkind[(U8)op] == BRANCHJ) {
16737 const regnode *nnode = (OP(next) == LONGJMP
16738 ? regnext((regnode *)next)
16740 if (last && nnode > last)
16742 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
16745 else if (PL_regkind[(U8)op] == BRANCH) {
16747 DUMPUNTIL(NEXTOPER(node), next);
16749 else if ( PL_regkind[(U8)op] == TRIE ) {
16750 const regnode *this_trie = node;
16751 const char op = OP(node);
16752 const U32 n = ARG(node);
16753 const reg_ac_data * const ac = op>=AHOCORASICK ?
16754 (reg_ac_data *)ri->data->data[n] :
16756 const reg_trie_data * const trie =
16757 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
16759 AV *const trie_words
16760 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
16762 const regnode *nextbranch= NULL;
16765 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
16766 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
16768 PerlIO_printf(Perl_debug_log, "%*s%s ",
16769 (int)(2*(indent+3)), "",
16771 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
16772 SvCUR(*elem_ptr), 60,
16773 PL_colors[0], PL_colors[1],
16775 ? PERL_PV_ESCAPE_UNI
16777 | PERL_PV_PRETTY_ELLIPSES
16778 | PERL_PV_PRETTY_LTGT
16783 U16 dist= trie->jump[word_idx+1];
16784 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
16785 (UV)((dist ? this_trie + dist : next) - start));
16788 nextbranch= this_trie + trie->jump[0];
16789 DUMPUNTIL(this_trie + dist, nextbranch);
16791 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
16792 nextbranch= regnext((regnode *)nextbranch);
16794 PerlIO_printf(Perl_debug_log, "\n");
16797 if (last && next > last)
16802 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
16803 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
16804 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
16806 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
16808 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
16810 else if ( op == PLUS || op == STAR) {
16811 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
16813 else if (PL_regkind[(U8)op] == ANYOF) {
16814 /* arglen 1 + class block */
16815 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
16816 ? ANYOF_POSIXL_SKIP
16818 node = NEXTOPER(node);
16820 else if (PL_regkind[(U8)op] == EXACT) {
16821 /* Literal string, where present. */
16822 node += NODE_SZ_STR(node) - 1;
16823 node = NEXTOPER(node);
16826 node = NEXTOPER(node);
16827 node += regarglen[(U8)op];
16829 if (op == CURLYX || op == OPEN)
16833 #ifdef DEBUG_DUMPUNTIL
16834 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
16839 #endif /* DEBUGGING */
16843 * c-indentation-style: bsd
16844 * c-basic-offset: 4
16845 * indent-tabs-mode: nil
16848 * ex: set ts=8 sts=4 sw=4 et: