5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
75 #undef PERL_IN_XSUB_RE
76 #define PERL_IN_XSUB_RE 1
78 #undef PERL_IN_XSUB_RE
80 #ifndef PERL_IN_XSUB_RE
85 #ifdef PERL_IN_XSUB_RE
87 EXTERN_C const struct regexp_engine my_reg_engine;
92 #include "dquote_static.c"
93 #include "inline_invlist.c"
94 #include "unicode_constants.h"
96 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
98 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
99 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
100 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
101 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
104 #define STATIC static
108 #define MIN(a,b) ((a) < (b) ? (a) : (b))
111 /* this is a chain of data about sub patterns we are processing that
112 need to be handled separately/specially in study_chunk. Its so
113 we can simulate recursion without losing state. */
115 typedef struct scan_frame {
116 regnode *last_regnode; /* last node to process in this frame */
117 regnode *next_regnode; /* next node to process when last is reached */
118 U32 prev_recursed_depth;
119 I32 stopparen; /* what stopparen do we use */
120 U32 is_top_frame; /* what flags do we use? */
122 struct scan_frame *this_prev_frame; /* this previous frame */
123 struct scan_frame *prev_frame; /* previous frame */
124 struct scan_frame *next_frame; /* next frame */
127 /* Certain characters are output as a sequence with the first being a
129 #define isBACKSLASHED_PUNCT(c) \
130 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
133 struct RExC_state_t {
134 U32 flags; /* RXf_* are we folding, multilining? */
135 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
136 char *precomp; /* uncompiled string. */
137 REGEXP *rx_sv; /* The SV that is the regexp. */
138 regexp *rx; /* perl core regexp structure */
139 regexp_internal *rxi; /* internal data for regexp object
141 char *start; /* Start of input for compile */
142 char *end; /* End of input for compile */
143 char *parse; /* Input-scan pointer. */
144 SSize_t whilem_seen; /* number of WHILEM in this expr */
145 regnode *emit_start; /* Start of emitted-code area */
146 regnode *emit_bound; /* First regnode outside of the
148 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
149 implies compiling, so don't emit */
150 regnode_ssc emit_dummy; /* placeholder for emit to point to;
151 large enough for the largest
152 non-EXACTish node, so can use it as
154 I32 naughty; /* How bad is this pattern? */
155 I32 sawback; /* Did we see \1, ...? */
157 SSize_t size; /* Code size. */
158 I32 npar; /* Capture buffer count, (OPEN) plus
159 one. ("par" 0 is the whole
161 I32 nestroot; /* root parens we are in - used by
165 regnode **open_parens; /* pointers to open parens */
166 regnode **close_parens; /* pointers to close parens */
167 regnode *opend; /* END node in program */
168 I32 utf8; /* whether the pattern is utf8 or not */
169 I32 orig_utf8; /* whether the pattern was originally in utf8 */
170 /* XXX use this for future optimisation of case
171 * where pattern must be upgraded to utf8. */
172 I32 uni_semantics; /* If a d charset modifier should use unicode
173 rules, even if the pattern is not in
175 HV *paren_names; /* Paren names */
177 regnode **recurse; /* Recurse regops */
178 I32 recurse_count; /* Number of recurse regops */
179 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
181 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
185 I32 override_recoding;
187 I32 recode_x_to_native;
189 I32 in_multi_char_class;
190 struct reg_code_block *code_blocks; /* positions of literal (?{})
192 int num_code_blocks; /* size of code_blocks[] */
193 int code_index; /* next code_blocks[] slot */
194 SSize_t maxlen; /* mininum possible number of chars in string to match */
195 scan_frame *frame_head;
196 scan_frame *frame_last;
199 #ifdef ADD_TO_REGEXEC
200 char *starttry; /* -Dr: where regtry was called. */
201 #define RExC_starttry (pRExC_state->starttry)
203 SV *runtime_code_qr; /* qr with the runtime code blocks */
205 const char *lastparse;
207 AV *paren_name_list; /* idx -> name */
208 U32 study_chunk_recursed_count;
211 #define RExC_lastparse (pRExC_state->lastparse)
212 #define RExC_lastnum (pRExC_state->lastnum)
213 #define RExC_paren_name_list (pRExC_state->paren_name_list)
214 #define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
215 #define RExC_mysv (pRExC_state->mysv1)
216 #define RExC_mysv1 (pRExC_state->mysv1)
217 #define RExC_mysv2 (pRExC_state->mysv2)
222 #define RExC_flags (pRExC_state->flags)
223 #define RExC_pm_flags (pRExC_state->pm_flags)
224 #define RExC_precomp (pRExC_state->precomp)
225 #define RExC_rx_sv (pRExC_state->rx_sv)
226 #define RExC_rx (pRExC_state->rx)
227 #define RExC_rxi (pRExC_state->rxi)
228 #define RExC_start (pRExC_state->start)
229 #define RExC_end (pRExC_state->end)
230 #define RExC_parse (pRExC_state->parse)
231 #define RExC_whilem_seen (pRExC_state->whilem_seen)
232 #ifdef RE_TRACK_PATTERN_OFFSETS
233 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
236 #define RExC_emit (pRExC_state->emit)
237 #define RExC_emit_dummy (pRExC_state->emit_dummy)
238 #define RExC_emit_start (pRExC_state->emit_start)
239 #define RExC_emit_bound (pRExC_state->emit_bound)
240 #define RExC_sawback (pRExC_state->sawback)
241 #define RExC_seen (pRExC_state->seen)
242 #define RExC_size (pRExC_state->size)
243 #define RExC_maxlen (pRExC_state->maxlen)
244 #define RExC_npar (pRExC_state->npar)
245 #define RExC_nestroot (pRExC_state->nestroot)
246 #define RExC_extralen (pRExC_state->extralen)
247 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
248 #define RExC_utf8 (pRExC_state->utf8)
249 #define RExC_uni_semantics (pRExC_state->uni_semantics)
250 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
251 #define RExC_open_parens (pRExC_state->open_parens)
252 #define RExC_close_parens (pRExC_state->close_parens)
253 #define RExC_opend (pRExC_state->opend)
254 #define RExC_paren_names (pRExC_state->paren_names)
255 #define RExC_recurse (pRExC_state->recurse)
256 #define RExC_recurse_count (pRExC_state->recurse_count)
257 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
258 #define RExC_study_chunk_recursed_bytes \
259 (pRExC_state->study_chunk_recursed_bytes)
260 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
261 #define RExC_contains_locale (pRExC_state->contains_locale)
262 #define RExC_contains_i (pRExC_state->contains_i)
263 #define RExC_override_recoding (pRExC_state->override_recoding)
265 # define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
267 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
268 #define RExC_frame_head (pRExC_state->frame_head)
269 #define RExC_frame_last (pRExC_state->frame_last)
270 #define RExC_frame_count (pRExC_state->frame_count)
271 #define RExC_strict (pRExC_state->strict)
273 /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
274 * a flag to disable back-off on the fixed/floating substrings - if it's
275 * a high complexity pattern we assume the benefit of avoiding a full match
276 * is worth the cost of checking for the substrings even if they rarely help.
278 #define RExC_naughty (pRExC_state->naughty)
279 #define TOO_NAUGHTY (10)
280 #define MARK_NAUGHTY(add) \
281 if (RExC_naughty < TOO_NAUGHTY) \
282 RExC_naughty += (add)
283 #define MARK_NAUGHTY_EXP(exp, add) \
284 if (RExC_naughty < TOO_NAUGHTY) \
285 RExC_naughty += RExC_naughty / (exp) + (add)
287 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
288 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
289 ((*s) == '{' && regcurly(s)))
292 * Flags to be passed up and down.
294 #define WORST 0 /* Worst case. */
295 #define HASWIDTH 0x01 /* Known to match non-null strings. */
297 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
298 * character. (There needs to be a case: in the switch statement in regexec.c
299 * for any node marked SIMPLE.) Note that this is not the same thing as
302 #define SPSTART 0x04 /* Starts with * or + */
303 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
304 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
305 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
307 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
309 /* whether trie related optimizations are enabled */
310 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
311 #define TRIE_STUDY_OPT
312 #define FULL_TRIE_STUDY
318 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
319 #define PBITVAL(paren) (1 << ((paren) & 7))
320 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
321 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
322 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
324 #define REQUIRE_UTF8 STMT_START { \
326 *flagp = RESTART_UTF8; \
331 /* This converts the named class defined in regcomp.h to its equivalent class
332 * number defined in handy.h. */
333 #define namedclass_to_classnum(class) ((int) ((class) / 2))
334 #define classnum_to_namedclass(classnum) ((classnum) * 2)
336 #define _invlist_union_complement_2nd(a, b, output) \
337 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
338 #define _invlist_intersection_complement_2nd(a, b, output) \
339 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
341 /* About scan_data_t.
343 During optimisation we recurse through the regexp program performing
344 various inplace (keyhole style) optimisations. In addition study_chunk
345 and scan_commit populate this data structure with information about
346 what strings MUST appear in the pattern. We look for the longest
347 string that must appear at a fixed location, and we look for the
348 longest string that may appear at a floating location. So for instance
353 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
354 strings (because they follow a .* construct). study_chunk will identify
355 both FOO and BAR as being the longest fixed and floating strings respectively.
357 The strings can be composites, for instance
361 will result in a composite fixed substring 'foo'.
363 For each string some basic information is maintained:
365 - offset or min_offset
366 This is the position the string must appear at, or not before.
367 It also implicitly (when combined with minlenp) tells us how many
368 characters must match before the string we are searching for.
369 Likewise when combined with minlenp and the length of the string it
370 tells us how many characters must appear after the string we have
374 Only used for floating strings. This is the rightmost point that
375 the string can appear at. If set to SSize_t_MAX it indicates that the
376 string can occur infinitely far to the right.
379 A pointer to the minimum number of characters of the pattern that the
380 string was found inside. This is important as in the case of positive
381 lookahead or positive lookbehind we can have multiple patterns
386 The minimum length of the pattern overall is 3, the minimum length
387 of the lookahead part is 3, but the minimum length of the part that
388 will actually match is 1. So 'FOO's minimum length is 3, but the
389 minimum length for the F is 1. This is important as the minimum length
390 is used to determine offsets in front of and behind the string being
391 looked for. Since strings can be composites this is the length of the
392 pattern at the time it was committed with a scan_commit. Note that
393 the length is calculated by study_chunk, so that the minimum lengths
394 are not known until the full pattern has been compiled, thus the
395 pointer to the value.
399 In the case of lookbehind the string being searched for can be
400 offset past the start point of the final matching string.
401 If this value was just blithely removed from the min_offset it would
402 invalidate some of the calculations for how many chars must match
403 before or after (as they are derived from min_offset and minlen and
404 the length of the string being searched for).
405 When the final pattern is compiled and the data is moved from the
406 scan_data_t structure into the regexp structure the information
407 about lookbehind is factored in, with the information that would
408 have been lost precalculated in the end_shift field for the
411 The fields pos_min and pos_delta are used to store the minimum offset
412 and the delta to the maximum offset at the current point in the pattern.
416 typedef struct scan_data_t {
417 /*I32 len_min; unused */
418 /*I32 len_delta; unused */
422 SSize_t last_end; /* min value, <0 unless valid. */
423 SSize_t last_start_min;
424 SSize_t last_start_max;
425 SV **longest; /* Either &l_fixed, or &l_float. */
426 SV *longest_fixed; /* longest fixed string found in pattern */
427 SSize_t offset_fixed; /* offset where it starts */
428 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
429 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
430 SV *longest_float; /* longest floating string found in pattern */
431 SSize_t offset_float_min; /* earliest point in string it can appear */
432 SSize_t offset_float_max; /* latest point in string it can appear */
433 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
434 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
437 SSize_t *last_closep;
438 regnode_ssc *start_class;
442 * Forward declarations for pregcomp()'s friends.
445 static const scan_data_t zero_scan_data =
446 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
448 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
449 #define SF_BEFORE_SEOL 0x0001
450 #define SF_BEFORE_MEOL 0x0002
451 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
452 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
454 #define SF_FIX_SHIFT_EOL (+2)
455 #define SF_FL_SHIFT_EOL (+4)
457 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
458 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
460 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
461 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
462 #define SF_IS_INF 0x0040
463 #define SF_HAS_PAR 0x0080
464 #define SF_IN_PAR 0x0100
465 #define SF_HAS_EVAL 0x0200
466 #define SCF_DO_SUBSTR 0x0400
467 #define SCF_DO_STCLASS_AND 0x0800
468 #define SCF_DO_STCLASS_OR 0x1000
469 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
470 #define SCF_WHILEM_VISITED_POS 0x2000
472 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
473 #define SCF_SEEN_ACCEPT 0x8000
474 #define SCF_TRIE_DOING_RESTUDY 0x10000
475 #define SCF_IN_DEFINE 0x20000
480 #define UTF cBOOL(RExC_utf8)
482 /* The enums for all these are ordered so things work out correctly */
483 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
484 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
485 == REGEX_DEPENDS_CHARSET)
486 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
487 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
488 >= REGEX_UNICODE_CHARSET)
489 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
490 == REGEX_ASCII_RESTRICTED_CHARSET)
491 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
492 >= REGEX_ASCII_RESTRICTED_CHARSET)
493 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
494 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
496 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
498 /* For programs that want to be strictly Unicode compatible by dying if any
499 * attempt is made to match a non-Unicode code point against a Unicode
501 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
503 #define OOB_NAMEDCLASS -1
505 /* There is no code point that is out-of-bounds, so this is problematic. But
506 * its only current use is to initialize a variable that is always set before
508 #define OOB_UNICODE 0xDEADBEEF
510 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
511 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
514 /* length of regex to show in messages that don't mark a position within */
515 #define RegexLengthToShowInErrorMessages 127
518 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
519 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
520 * op/pragma/warn/regcomp.
522 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
523 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
525 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
526 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
528 #define REPORT_LOCATION_ARGS(offset) \
529 UTF8fARG(UTF, offset, RExC_precomp), \
530 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
532 /* Used to point after bad bytes for an error message, but avoid skipping
533 * past a nul byte. */
534 #define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
537 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
538 * arg. Show regex, up to a maximum length. If it's too long, chop and add
541 #define _FAIL(code) STMT_START { \
542 const char *ellipses = ""; \
543 IV len = RExC_end - RExC_precomp; \
546 SAVEFREESV(RExC_rx_sv); \
547 if (len > RegexLengthToShowInErrorMessages) { \
548 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
549 len = RegexLengthToShowInErrorMessages - 10; \
555 #define FAIL(msg) _FAIL( \
556 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
557 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
559 #define FAIL2(msg,arg) _FAIL( \
560 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
561 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
564 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
566 #define Simple_vFAIL(m) STMT_START { \
568 (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
569 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
570 m, REPORT_LOCATION_ARGS(offset)); \
574 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
576 #define vFAIL(m) STMT_START { \
578 SAVEFREESV(RExC_rx_sv); \
583 * Like Simple_vFAIL(), but accepts two arguments.
585 #define Simple_vFAIL2(m,a1) STMT_START { \
586 const IV offset = RExC_parse - RExC_precomp; \
587 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
588 REPORT_LOCATION_ARGS(offset)); \
592 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
594 #define vFAIL2(m,a1) STMT_START { \
596 SAVEFREESV(RExC_rx_sv); \
597 Simple_vFAIL2(m, a1); \
602 * Like Simple_vFAIL(), but accepts three arguments.
604 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
605 const IV offset = RExC_parse - RExC_precomp; \
606 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
607 REPORT_LOCATION_ARGS(offset)); \
611 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
613 #define vFAIL3(m,a1,a2) STMT_START { \
615 SAVEFREESV(RExC_rx_sv); \
616 Simple_vFAIL3(m, a1, a2); \
620 * Like Simple_vFAIL(), but accepts four arguments.
622 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
623 const IV offset = RExC_parse - RExC_precomp; \
624 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
625 REPORT_LOCATION_ARGS(offset)); \
628 #define vFAIL4(m,a1,a2,a3) STMT_START { \
630 SAVEFREESV(RExC_rx_sv); \
631 Simple_vFAIL4(m, a1, a2, a3); \
634 /* A specialized version of vFAIL2 that works with UTF8f */
635 #define vFAIL2utf8f(m, a1) STMT_START { \
636 const IV offset = RExC_parse - RExC_precomp; \
638 SAVEFREESV(RExC_rx_sv); \
639 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
640 REPORT_LOCATION_ARGS(offset)); \
643 /* These have asserts in them because of [perl #122671] Many warnings in
644 * regcomp.c can occur twice. If they get output in pass1 and later in that
645 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
646 * would get output again. So they should be output in pass2, and these
647 * asserts make sure new warnings follow that paradigm. */
649 /* m is not necessarily a "literal string", in this macro */
650 #define reg_warn_non_literal_string(loc, m) STMT_START { \
651 const IV offset = loc - RExC_precomp; \
652 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
653 m, REPORT_LOCATION_ARGS(offset)); \
656 #define ckWARNreg(loc,m) STMT_START { \
657 const IV offset = loc - RExC_precomp; \
658 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
659 REPORT_LOCATION_ARGS(offset)); \
662 #define vWARN(loc, m) STMT_START { \
663 const IV offset = loc - RExC_precomp; \
664 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
665 REPORT_LOCATION_ARGS(offset)); \
668 #define vWARN_dep(loc, m) STMT_START { \
669 const IV offset = loc - RExC_precomp; \
670 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
671 REPORT_LOCATION_ARGS(offset)); \
674 #define ckWARNdep(loc,m) STMT_START { \
675 const IV offset = loc - RExC_precomp; \
676 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
678 REPORT_LOCATION_ARGS(offset)); \
681 #define ckWARNregdep(loc,m) STMT_START { \
682 const IV offset = loc - RExC_precomp; \
683 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
685 REPORT_LOCATION_ARGS(offset)); \
688 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
689 const IV offset = loc - RExC_precomp; \
690 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
692 a1, REPORT_LOCATION_ARGS(offset)); \
695 #define ckWARN2reg(loc, m, a1) STMT_START { \
696 const IV offset = loc - RExC_precomp; \
697 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
698 a1, REPORT_LOCATION_ARGS(offset)); \
701 #define vWARN3(loc, m, a1, a2) STMT_START { \
702 const IV offset = loc - RExC_precomp; \
703 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
704 a1, a2, REPORT_LOCATION_ARGS(offset)); \
707 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
708 const IV offset = loc - RExC_precomp; \
709 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
710 a1, a2, REPORT_LOCATION_ARGS(offset)); \
713 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
714 const IV offset = loc - RExC_precomp; \
715 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
716 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
719 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
720 const IV offset = loc - RExC_precomp; \
721 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
722 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
725 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
726 const IV offset = loc - RExC_precomp; \
727 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
728 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
731 /* Macros for recording node offsets. 20001227 mjd@plover.com
732 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
733 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
734 * Element 0 holds the number n.
735 * Position is 1 indexed.
737 #ifndef RE_TRACK_PATTERN_OFFSETS
738 #define Set_Node_Offset_To_R(node,byte)
739 #define Set_Node_Offset(node,byte)
740 #define Set_Cur_Node_Offset
741 #define Set_Node_Length_To_R(node,len)
742 #define Set_Node_Length(node,len)
743 #define Set_Node_Cur_Length(node,start)
744 #define Node_Offset(n)
745 #define Node_Length(n)
746 #define Set_Node_Offset_Length(node,offset,len)
747 #define ProgLen(ri) ri->u.proglen
748 #define SetProgLen(ri,x) ri->u.proglen = x
750 #define ProgLen(ri) ri->u.offsets[0]
751 #define SetProgLen(ri,x) ri->u.offsets[0] = x
752 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
754 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
755 __LINE__, (int)(node), (int)(byte))); \
757 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
760 RExC_offsets[2*(node)-1] = (byte); \
765 #define Set_Node_Offset(node,byte) \
766 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
767 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
769 #define Set_Node_Length_To_R(node,len) STMT_START { \
771 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
772 __LINE__, (int)(node), (int)(len))); \
774 Perl_croak(aTHX_ "value of node is %d in Length macro", \
777 RExC_offsets[2*(node)] = (len); \
782 #define Set_Node_Length(node,len) \
783 Set_Node_Length_To_R((node)-RExC_emit_start, len)
784 #define Set_Node_Cur_Length(node, start) \
785 Set_Node_Length(node, RExC_parse - start)
787 /* Get offsets and lengths */
788 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
789 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
791 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
792 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
793 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
797 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
798 #define EXPERIMENTAL_INPLACESCAN
799 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
801 #define DEBUG_RExC_seen() \
802 DEBUG_OPTIMISE_MORE_r({ \
803 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
805 if (RExC_seen & REG_ZERO_LEN_SEEN) \
806 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
808 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
809 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
811 if (RExC_seen & REG_GPOS_SEEN) \
812 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
814 if (RExC_seen & REG_RECURSE_SEEN) \
815 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
817 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
818 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
820 if (RExC_seen & REG_VERBARG_SEEN) \
821 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
823 if (RExC_seen & REG_CUTGROUP_SEEN) \
824 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
826 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
827 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
829 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
830 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
832 if (RExC_seen & REG_GOSTART_SEEN) \
833 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
835 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
836 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
838 PerlIO_printf(Perl_debug_log,"\n"); \
841 #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
842 if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
844 #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
846 PerlIO_printf(Perl_debug_log, "%s", open_str); \
847 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
848 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
849 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
850 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
851 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
852 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
853 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
854 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
855 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
856 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
857 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
858 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
859 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
860 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
861 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
862 PerlIO_printf(Perl_debug_log, "%s", close_str); \
866 #define DEBUG_STUDYDATA(str,data,depth) \
867 DEBUG_OPTIMISE_MORE_r(if(data){ \
868 PerlIO_printf(Perl_debug_log, \
869 "%*s" str "Pos:%"IVdf"/%"IVdf \
871 (int)(depth)*2, "", \
872 (IV)((data)->pos_min), \
873 (IV)((data)->pos_delta), \
874 (UV)((data)->flags) \
876 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
877 PerlIO_printf(Perl_debug_log, \
878 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
879 (IV)((data)->whilem_c), \
880 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
881 is_inf ? "INF " : "" \
883 if ((data)->last_found) \
884 PerlIO_printf(Perl_debug_log, \
885 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
886 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
887 SvPVX_const((data)->last_found), \
888 (IV)((data)->last_end), \
889 (IV)((data)->last_start_min), \
890 (IV)((data)->last_start_max), \
891 ((data)->longest && \
892 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
893 SvPVX_const((data)->longest_fixed), \
894 (IV)((data)->offset_fixed), \
895 ((data)->longest && \
896 (data)->longest==&((data)->longest_float)) ? "*" : "", \
897 SvPVX_const((data)->longest_float), \
898 (IV)((data)->offset_float_min), \
899 (IV)((data)->offset_float_max) \
901 PerlIO_printf(Perl_debug_log,"\n"); \
904 /* is c a control character for which we have a mnemonic? */
905 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
908 S_cntrl_to_mnemonic(const U8 c)
910 /* Returns the mnemonic string that represents character 'c', if one
911 * exists; NULL otherwise. The only ones that exist for the purposes of
912 * this routine are a few control characters */
915 case '\a': return "\\a";
916 case '\b': return "\\b";
917 case ESC_NATIVE: return "\\e";
918 case '\f': return "\\f";
919 case '\n': return "\\n";
920 case '\r': return "\\r";
921 case '\t': return "\\t";
927 /* Mark that we cannot extend a found fixed substring at this point.
928 Update the longest found anchored substring and the longest found
929 floating substrings if needed. */
932 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
933 SSize_t *minlenp, int is_inf)
935 const STRLEN l = CHR_SVLEN(data->last_found);
936 const STRLEN old_l = CHR_SVLEN(*data->longest);
937 GET_RE_DEBUG_FLAGS_DECL;
939 PERL_ARGS_ASSERT_SCAN_COMMIT;
941 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
942 SvSetMagicSV(*data->longest, data->last_found);
943 if (*data->longest == data->longest_fixed) {
944 data->offset_fixed = l ? data->last_start_min : data->pos_min;
945 if (data->flags & SF_BEFORE_EOL)
947 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
949 data->flags &= ~SF_FIX_BEFORE_EOL;
950 data->minlen_fixed=minlenp;
951 data->lookbehind_fixed=0;
953 else { /* *data->longest == data->longest_float */
954 data->offset_float_min = l ? data->last_start_min : data->pos_min;
955 data->offset_float_max = (l
956 ? data->last_start_max
957 : (data->pos_delta > SSize_t_MAX - data->pos_min
959 : data->pos_min + data->pos_delta));
961 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
962 data->offset_float_max = SSize_t_MAX;
963 if (data->flags & SF_BEFORE_EOL)
965 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
967 data->flags &= ~SF_FL_BEFORE_EOL;
968 data->minlen_float=minlenp;
969 data->lookbehind_float=0;
972 SvCUR_set(data->last_found, 0);
974 SV * const sv = data->last_found;
975 if (SvUTF8(sv) && SvMAGICAL(sv)) {
976 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
982 data->flags &= ~SF_BEFORE_EOL;
983 DEBUG_STUDYDATA("commit: ",data,0);
986 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
987 * list that describes which code points it matches */
990 S_ssc_anything(pTHX_ regnode_ssc *ssc)
992 /* Set the SSC 'ssc' to match an empty string or any code point */
994 PERL_ARGS_ASSERT_SSC_ANYTHING;
996 assert(is_ANYOF_SYNTHETIC(ssc));
998 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
999 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
1000 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
1004 S_ssc_is_anything(const regnode_ssc *ssc)
1006 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1007 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1008 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1009 * in any way, so there's no point in using it */
1014 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
1016 assert(is_ANYOF_SYNTHETIC(ssc));
1018 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
1022 /* See if the list consists solely of the range 0 - Infinity */
1023 invlist_iterinit(ssc->invlist);
1024 ret = invlist_iternext(ssc->invlist, &start, &end)
1028 invlist_iterfinish(ssc->invlist);
1034 /* If e.g., both \w and \W are set, matches everything */
1035 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1037 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1038 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1048 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
1050 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1051 * string, any code point, or any posix class under locale */
1053 PERL_ARGS_ASSERT_SSC_INIT;
1055 Zero(ssc, 1, regnode_ssc);
1056 set_ANYOF_SYNTHETIC(ssc);
1057 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
1060 /* If any portion of the regex is to operate under locale rules that aren't
1061 * fully known at compile time, initialization includes it. The reason
1062 * this isn't done for all regexes is that the optimizer was written under
1063 * the assumption that locale was all-or-nothing. Given the complexity and
1064 * lack of documentation in the optimizer, and that there are inadequate
1065 * test cases for locale, many parts of it may not work properly, it is
1066 * safest to avoid locale unless necessary. */
1067 if (RExC_contains_locale) {
1068 ANYOF_POSIXL_SETALL(ssc);
1071 ANYOF_POSIXL_ZERO(ssc);
1076 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1077 const regnode_ssc *ssc)
1079 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1080 * to the list of code points matched, and locale posix classes; hence does
1081 * not check its flags) */
1086 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1088 assert(is_ANYOF_SYNTHETIC(ssc));
1090 invlist_iterinit(ssc->invlist);
1091 ret = invlist_iternext(ssc->invlist, &start, &end)
1095 invlist_iterfinish(ssc->invlist);
1101 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1109 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1110 const regnode_charclass* const node)
1112 /* Returns a mortal inversion list defining which code points are matched
1113 * by 'node', which is of type ANYOF. Handles complementing the result if
1114 * appropriate. If some code points aren't knowable at this time, the
1115 * returned list must, and will, contain every code point that is a
1118 SV* invlist = sv_2mortal(_new_invlist(0));
1119 SV* only_utf8_locale_invlist = NULL;
1121 const U32 n = ARG(node);
1122 bool new_node_has_latin1 = FALSE;
1124 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1126 /* Look at the data structure created by S_set_ANYOF_arg() */
1127 if (n != ANYOF_ONLY_HAS_BITMAP) {
1128 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1129 AV * const av = MUTABLE_AV(SvRV(rv));
1130 SV **const ary = AvARRAY(av);
1131 assert(RExC_rxi->data->what[n] == 's');
1133 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1134 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1136 else if (ary[0] && ary[0] != &PL_sv_undef) {
1138 /* Here, no compile-time swash, and there are things that won't be
1139 * known until runtime -- we have to assume it could be anything */
1140 return _add_range_to_invlist(invlist, 0, UV_MAX);
1142 else if (ary[3] && ary[3] != &PL_sv_undef) {
1144 /* Here no compile-time swash, and no run-time only data. Use the
1145 * node's inversion list */
1146 invlist = sv_2mortal(invlist_clone(ary[3]));
1149 /* Get the code points valid only under UTF-8 locales */
1150 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1151 && ary[2] && ary[2] != &PL_sv_undef)
1153 only_utf8_locale_invlist = ary[2];
1157 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1158 * code points, and an inversion list for the others, but if there are code
1159 * points that should match only conditionally on the target string being
1160 * UTF-8, those are placed in the inversion list, and not the bitmap.
1161 * Since there are circumstances under which they could match, they are
1162 * included in the SSC. But if the ANYOF node is to be inverted, we have
1163 * to exclude them here, so that when we invert below, the end result
1164 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1165 * have to do this here before we add the unconditionally matched code
1167 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1168 _invlist_intersection_complement_2nd(invlist,
1173 /* Add in the points from the bit map */
1174 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1175 if (ANYOF_BITMAP_TEST(node, i)) {
1176 invlist = add_cp_to_invlist(invlist, i);
1177 new_node_has_latin1 = TRUE;
1181 /* If this can match all upper Latin1 code points, have to add them
1183 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1184 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1187 /* Similarly for these */
1188 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1189 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1192 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1193 _invlist_invert(invlist);
1195 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1197 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1198 * locale. We can skip this if there are no 0-255 at all. */
1199 _invlist_union(invlist, PL_Latin1, &invlist);
1202 /* Similarly add the UTF-8 locale possible matches. These have to be
1203 * deferred until after the non-UTF-8 locale ones are taken care of just
1204 * above, or it leads to wrong results under ANYOF_INVERT */
1205 if (only_utf8_locale_invlist) {
1206 _invlist_union_maybe_complement_2nd(invlist,
1207 only_utf8_locale_invlist,
1208 ANYOF_FLAGS(node) & ANYOF_INVERT,
1215 /* These two functions currently do the exact same thing */
1216 #define ssc_init_zero ssc_init
1218 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1219 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1221 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1222 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1223 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1226 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1227 const regnode_charclass *and_with)
1229 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1230 * another SSC or a regular ANYOF class. Can create false positives. */
1235 PERL_ARGS_ASSERT_SSC_AND;
1237 assert(is_ANYOF_SYNTHETIC(ssc));
1239 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1240 * the code point inversion list and just the relevant flags */
1241 if (is_ANYOF_SYNTHETIC(and_with)) {
1242 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1243 anded_flags = ANYOF_FLAGS(and_with);
1245 /* XXX This is a kludge around what appears to be deficiencies in the
1246 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1247 * there are paths through the optimizer where it doesn't get weeded
1248 * out when it should. And if we don't make some extra provision for
1249 * it like the code just below, it doesn't get added when it should.
1250 * This solution is to add it only when AND'ing, which is here, and
1251 * only when what is being AND'ed is the pristine, original node
1252 * matching anything. Thus it is like adding it to ssc_anything() but
1253 * only when the result is to be AND'ed. Probably the same solution
1254 * could be adopted for the same problem we have with /l matching,
1255 * which is solved differently in S_ssc_init(), and that would lead to
1256 * fewer false positives than that solution has. But if this solution
1257 * creates bugs, the consequences are only that a warning isn't raised
1258 * that should be; while the consequences for having /l bugs is
1259 * incorrect matches */
1260 if (ssc_is_anything((regnode_ssc *)and_with)) {
1261 anded_flags |= ANYOF_WARN_SUPER;
1265 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1266 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1269 ANYOF_FLAGS(ssc) &= anded_flags;
1271 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1272 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1273 * 'and_with' may be inverted. When not inverted, we have the situation of
1275 * (C1 | P1) & (C2 | P2)
1276 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1277 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1278 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1279 * <= ((C1 & C2) | P1 | P2)
1280 * Alternatively, the last few steps could be:
1281 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1282 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1283 * <= (C1 | C2 | (P1 & P2))
1284 * We favor the second approach if either P1 or P2 is non-empty. This is
1285 * because these components are a barrier to doing optimizations, as what
1286 * they match cannot be known until the moment of matching as they are
1287 * dependent on the current locale, 'AND"ing them likely will reduce or
1289 * But we can do better if we know that C1,P1 are in their initial state (a
1290 * frequent occurrence), each matching everything:
1291 * (<everything>) & (C2 | P2) = C2 | P2
1292 * Similarly, if C2,P2 are in their initial state (again a frequent
1293 * occurrence), the result is a no-op
1294 * (C1 | P1) & (<everything>) = C1 | P1
1297 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1298 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1299 * <= (C1 & ~C2) | (P1 & ~P2)
1302 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1303 && ! is_ANYOF_SYNTHETIC(and_with))
1307 ssc_intersection(ssc,
1309 FALSE /* Has already been inverted */
1312 /* If either P1 or P2 is empty, the intersection will be also; can skip
1314 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1315 ANYOF_POSIXL_ZERO(ssc);
1317 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1319 /* Note that the Posix class component P from 'and_with' actually
1321 * P = Pa | Pb | ... | Pn
1322 * where each component is one posix class, such as in [\w\s].
1324 * ~P = ~(Pa | Pb | ... | Pn)
1325 * = ~Pa & ~Pb & ... & ~Pn
1326 * <= ~Pa | ~Pb | ... | ~Pn
1327 * The last is something we can easily calculate, but unfortunately
1328 * is likely to have many false positives. We could do better
1329 * in some (but certainly not all) instances if two classes in
1330 * P have known relationships. For example
1331 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1333 * :lower: & :print: = :lower:
1334 * And similarly for classes that must be disjoint. For example,
1335 * since \s and \w can have no elements in common based on rules in
1336 * the POSIX standard,
1337 * \w & ^\S = nothing
1338 * Unfortunately, some vendor locales do not meet the Posix
1339 * standard, in particular almost everything by Microsoft.
1340 * The loop below just changes e.g., \w into \W and vice versa */
1342 regnode_charclass_posixl temp;
1343 int add = 1; /* To calculate the index of the complement */
1345 ANYOF_POSIXL_ZERO(&temp);
1346 for (i = 0; i < ANYOF_MAX; i++) {
1348 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1349 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1351 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1352 ANYOF_POSIXL_SET(&temp, i + add);
1354 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1356 ANYOF_POSIXL_AND(&temp, ssc);
1358 } /* else ssc already has no posixes */
1359 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1360 in its initial state */
1361 else if (! is_ANYOF_SYNTHETIC(and_with)
1362 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1364 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1365 * copy it over 'ssc' */
1366 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1367 if (is_ANYOF_SYNTHETIC(and_with)) {
1368 StructCopy(and_with, ssc, regnode_ssc);
1371 ssc->invlist = anded_cp_list;
1372 ANYOF_POSIXL_ZERO(ssc);
1373 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1374 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1378 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1379 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1381 /* One or the other of P1, P2 is non-empty. */
1382 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1383 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1385 ssc_union(ssc, anded_cp_list, FALSE);
1387 else { /* P1 = P2 = empty */
1388 ssc_intersection(ssc, anded_cp_list, FALSE);
1394 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1395 const regnode_charclass *or_with)
1397 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1398 * another SSC or a regular ANYOF class. Can create false positives if
1399 * 'or_with' is to be inverted. */
1404 PERL_ARGS_ASSERT_SSC_OR;
1406 assert(is_ANYOF_SYNTHETIC(ssc));
1408 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1409 * the code point inversion list and just the relevant flags */
1410 if (is_ANYOF_SYNTHETIC(or_with)) {
1411 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1412 ored_flags = ANYOF_FLAGS(or_with);
1415 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1416 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1419 ANYOF_FLAGS(ssc) |= ored_flags;
1421 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1422 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1423 * 'or_with' may be inverted. When not inverted, we have the simple
1424 * situation of computing:
1425 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1426 * If P1|P2 yields a situation with both a class and its complement are
1427 * set, like having both \w and \W, this matches all code points, and we
1428 * can delete these from the P component of the ssc going forward. XXX We
1429 * might be able to delete all the P components, but I (khw) am not certain
1430 * about this, and it is better to be safe.
1433 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1434 * <= (C1 | P1) | ~C2
1435 * <= (C1 | ~C2) | P1
1436 * (which results in actually simpler code than the non-inverted case)
1439 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1440 && ! is_ANYOF_SYNTHETIC(or_with))
1442 /* We ignore P2, leaving P1 going forward */
1443 } /* else Not inverted */
1444 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1445 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1446 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1448 for (i = 0; i < ANYOF_MAX; i += 2) {
1449 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1451 ssc_match_all_cp(ssc);
1452 ANYOF_POSIXL_CLEAR(ssc, i);
1453 ANYOF_POSIXL_CLEAR(ssc, i+1);
1461 FALSE /* Already has been inverted */
1465 PERL_STATIC_INLINE void
1466 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1468 PERL_ARGS_ASSERT_SSC_UNION;
1470 assert(is_ANYOF_SYNTHETIC(ssc));
1472 _invlist_union_maybe_complement_2nd(ssc->invlist,
1478 PERL_STATIC_INLINE void
1479 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1481 const bool invert2nd)
1483 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1485 assert(is_ANYOF_SYNTHETIC(ssc));
1487 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1493 PERL_STATIC_INLINE void
1494 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1496 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1498 assert(is_ANYOF_SYNTHETIC(ssc));
1500 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1503 PERL_STATIC_INLINE void
1504 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1506 /* AND just the single code point 'cp' into the SSC 'ssc' */
1508 SV* cp_list = _new_invlist(2);
1510 PERL_ARGS_ASSERT_SSC_CP_AND;
1512 assert(is_ANYOF_SYNTHETIC(ssc));
1514 cp_list = add_cp_to_invlist(cp_list, cp);
1515 ssc_intersection(ssc, cp_list,
1516 FALSE /* Not inverted */
1518 SvREFCNT_dec_NN(cp_list);
1521 PERL_STATIC_INLINE void
1522 S_ssc_clear_locale(regnode_ssc *ssc)
1524 /* Set the SSC 'ssc' to not match any locale things */
1525 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1527 assert(is_ANYOF_SYNTHETIC(ssc));
1529 ANYOF_POSIXL_ZERO(ssc);
1530 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1533 #define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1536 S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1538 /* The synthetic start class is used to hopefully quickly winnow down
1539 * places where a pattern could start a match in the target string. If it
1540 * doesn't really narrow things down that much, there isn't much point to
1541 * having the overhead of using it. This function uses some very crude
1542 * heuristics to decide if to use the ssc or not.
1544 * It returns TRUE if 'ssc' rules out more than half what it considers to
1545 * be the "likely" possible matches, but of course it doesn't know what the
1546 * actual things being matched are going to be; these are only guesses
1548 * For /l matches, it assumes that the only likely matches are going to be
1549 * in the 0-255 range, uniformly distributed, so half of that is 127
1550 * For /a and /d matches, it assumes that the likely matches will be just
1551 * the ASCII range, so half of that is 63
1552 * For /u and there isn't anything matching above the Latin1 range, it
1553 * assumes that that is the only range likely to be matched, and uses
1554 * half that as the cut-off: 127. If anything matches above Latin1,
1555 * it assumes that all of Unicode could match (uniformly), except for
1556 * non-Unicode code points and things in the General Category "Other"
1557 * (unassigned, private use, surrogates, controls and formats). This
1558 * is a much large number. */
1560 const U32 max_match = (LOC)
1564 : (invlist_highest(ssc->invlist) < 256)
1566 : ((NON_OTHER_COUNT + 1) / 2) - 1;
1567 U32 count = 0; /* Running total of number of code points matched by
1569 UV start, end; /* Start and end points of current range in inversion
1572 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1574 invlist_iterinit(ssc->invlist);
1575 while (invlist_iternext(ssc->invlist, &start, &end)) {
1577 /* /u is the only thing that we expect to match above 255; so if not /u
1578 * and even if there are matches above 255, ignore them. This catches
1579 * things like \d under /d which does match the digits above 255, but
1580 * since the pattern is /d, it is not likely to be expecting them */
1581 if (! UNI_SEMANTICS) {
1585 end = MIN(end, 255);
1587 count += end - start + 1;
1588 if (count > max_match) {
1589 invlist_iterfinish(ssc->invlist);
1599 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1601 /* The inversion list in the SSC is marked mortal; now we need a more
1602 * permanent copy, which is stored the same way that is done in a regular
1603 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1606 SV* invlist = invlist_clone(ssc->invlist);
1608 PERL_ARGS_ASSERT_SSC_FINALIZE;
1610 assert(is_ANYOF_SYNTHETIC(ssc));
1612 /* The code in this file assumes that all but these flags aren't relevant
1613 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1614 * by the time we reach here */
1615 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1617 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1619 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1620 NULL, NULL, NULL, FALSE);
1622 /* Make sure is clone-safe */
1623 ssc->invlist = NULL;
1625 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1626 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1629 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1632 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1633 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1634 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1635 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1636 ? (TRIE_LIST_CUR( idx ) - 1) \
1642 dump_trie(trie,widecharmap,revcharmap)
1643 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1644 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1646 These routines dump out a trie in a somewhat readable format.
1647 The _interim_ variants are used for debugging the interim
1648 tables that are used to generate the final compressed
1649 representation which is what dump_trie expects.
1651 Part of the reason for their existence is to provide a form
1652 of documentation as to how the different representations function.
1657 Dumps the final compressed table form of the trie to Perl_debug_log.
1658 Used for debugging make_trie().
1662 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1663 AV *revcharmap, U32 depth)
1666 SV *sv=sv_newmortal();
1667 int colwidth= widecharmap ? 6 : 4;
1669 GET_RE_DEBUG_FLAGS_DECL;
1671 PERL_ARGS_ASSERT_DUMP_TRIE;
1673 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1674 (int)depth * 2 + 2,"",
1675 "Match","Base","Ofs" );
1677 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1678 SV ** const tmp = av_fetch( revcharmap, state, 0);
1680 PerlIO_printf( Perl_debug_log, "%*s",
1682 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1683 PL_colors[0], PL_colors[1],
1684 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1685 PERL_PV_ESCAPE_FIRSTCHAR
1690 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1691 (int)depth * 2 + 2,"");
1693 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1694 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1695 PerlIO_printf( Perl_debug_log, "\n");
1697 for( state = 1 ; state < trie->statecount ; state++ ) {
1698 const U32 base = trie->states[ state ].trans.base;
1700 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1701 (int)depth * 2 + 2,"", (UV)state);
1703 if ( trie->states[ state ].wordnum ) {
1704 PerlIO_printf( Perl_debug_log, " W%4X",
1705 trie->states[ state ].wordnum );
1707 PerlIO_printf( Perl_debug_log, "%6s", "" );
1710 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1715 while( ( base + ofs < trie->uniquecharcount ) ||
1716 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1717 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1721 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1723 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1724 if ( ( base + ofs >= trie->uniquecharcount )
1725 && ( base + ofs - trie->uniquecharcount
1727 && trie->trans[ base + ofs
1728 - trie->uniquecharcount ].check == state )
1730 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1732 (UV)trie->trans[ base + ofs
1733 - trie->uniquecharcount ].next );
1735 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1739 PerlIO_printf( Perl_debug_log, "]");
1742 PerlIO_printf( Perl_debug_log, "\n" );
1744 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1746 for (word=1; word <= trie->wordcount; word++) {
1747 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1748 (int)word, (int)(trie->wordinfo[word].prev),
1749 (int)(trie->wordinfo[word].len));
1751 PerlIO_printf(Perl_debug_log, "\n" );
1754 Dumps a fully constructed but uncompressed trie in list form.
1755 List tries normally only are used for construction when the number of
1756 possible chars (trie->uniquecharcount) is very high.
1757 Used for debugging make_trie().
1760 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1761 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1765 SV *sv=sv_newmortal();
1766 int colwidth= widecharmap ? 6 : 4;
1767 GET_RE_DEBUG_FLAGS_DECL;
1769 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1771 /* print out the table precompression. */
1772 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1773 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1774 "------:-----+-----------------\n" );
1776 for( state=1 ; state < next_alloc ; state ++ ) {
1779 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1780 (int)depth * 2 + 2,"", (UV)state );
1781 if ( ! trie->states[ state ].wordnum ) {
1782 PerlIO_printf( Perl_debug_log, "%5s| ","");
1784 PerlIO_printf( Perl_debug_log, "W%4x| ",
1785 trie->states[ state ].wordnum
1788 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1789 SV ** const tmp = av_fetch( revcharmap,
1790 TRIE_LIST_ITEM(state,charid).forid, 0);
1792 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1794 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1796 PL_colors[0], PL_colors[1],
1797 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1798 | PERL_PV_ESCAPE_FIRSTCHAR
1800 TRIE_LIST_ITEM(state,charid).forid,
1801 (UV)TRIE_LIST_ITEM(state,charid).newstate
1804 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1805 (int)((depth * 2) + 14), "");
1808 PerlIO_printf( Perl_debug_log, "\n");
1813 Dumps a fully constructed but uncompressed trie in table form.
1814 This is the normal DFA style state transition table, with a few
1815 twists to facilitate compression later.
1816 Used for debugging make_trie().
1819 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1820 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1825 SV *sv=sv_newmortal();
1826 int colwidth= widecharmap ? 6 : 4;
1827 GET_RE_DEBUG_FLAGS_DECL;
1829 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1832 print out the table precompression so that we can do a visual check
1833 that they are identical.
1836 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1838 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1839 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1841 PerlIO_printf( Perl_debug_log, "%*s",
1843 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1844 PL_colors[0], PL_colors[1],
1845 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1846 PERL_PV_ESCAPE_FIRSTCHAR
1852 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1854 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1855 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1858 PerlIO_printf( Perl_debug_log, "\n" );
1860 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1862 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1863 (int)depth * 2 + 2,"",
1864 (UV)TRIE_NODENUM( state ) );
1866 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1867 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1869 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1871 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1873 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1874 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1875 (UV)trie->trans[ state ].check );
1877 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1878 (UV)trie->trans[ state ].check,
1879 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1887 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1888 startbranch: the first branch in the whole branch sequence
1889 first : start branch of sequence of branch-exact nodes.
1890 May be the same as startbranch
1891 last : Thing following the last branch.
1892 May be the same as tail.
1893 tail : item following the branch sequence
1894 count : words in the sequence
1895 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
1896 depth : indent depth
1898 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1900 A trie is an N'ary tree where the branches are determined by digital
1901 decomposition of the key. IE, at the root node you look up the 1st character and
1902 follow that branch repeat until you find the end of the branches. Nodes can be
1903 marked as "accepting" meaning they represent a complete word. Eg:
1907 would convert into the following structure. Numbers represent states, letters
1908 following numbers represent valid transitions on the letter from that state, if
1909 the number is in square brackets it represents an accepting state, otherwise it
1910 will be in parenthesis.
1912 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1916 (1) +-i->(6)-+-s->[7]
1918 +-s->(3)-+-h->(4)-+-e->[5]
1920 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1922 This shows that when matching against the string 'hers' we will begin at state 1
1923 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1924 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1925 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1926 single traverse. We store a mapping from accepting to state to which word was
1927 matched, and then when we have multiple possibilities we try to complete the
1928 rest of the regex in the order in which they occurred in the alternation.
1930 The only prior NFA like behaviour that would be changed by the TRIE support is
1931 the silent ignoring of duplicate alternations which are of the form:
1933 / (DUPE|DUPE) X? (?{ ... }) Y /x
1935 Thus EVAL blocks following a trie may be called a different number of times with
1936 and without the optimisation. With the optimisations dupes will be silently
1937 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1938 the following demonstrates:
1940 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1942 which prints out 'word' three times, but
1944 'words'=~/(word|word|word)(?{ print $1 })S/
1946 which doesnt print it out at all. This is due to other optimisations kicking in.
1948 Example of what happens on a structural level:
1950 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1952 1: CURLYM[1] {1,32767}(18)
1963 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1964 and should turn into:
1966 1: CURLYM[1] {1,32767}(18)
1968 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1976 Cases where tail != last would be like /(?foo|bar)baz/:
1986 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1987 and would end up looking like:
1990 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1997 d = uvchr_to_utf8_flags(d, uv, 0);
1999 is the recommended Unicode-aware way of saying
2004 #define TRIE_STORE_REVCHAR(val) \
2007 SV *zlopp = newSV(7); /* XXX: optimize me */ \
2008 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
2009 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2010 SvCUR_set(zlopp, kapow - flrbbbbb); \
2013 av_push(revcharmap, zlopp); \
2015 char ooooff = (char)val; \
2016 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2020 /* This gets the next character from the input, folding it if not already
2022 #define TRIE_READ_CHAR STMT_START { \
2025 /* if it is UTF then it is either already folded, or does not need \
2027 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
2029 else if (folder == PL_fold_latin1) { \
2030 /* This folder implies Unicode rules, which in the range expressible \
2031 * by not UTF is the lower case, with the two exceptions, one of \
2032 * which should have been taken care of before calling this */ \
2033 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2034 uvc = toLOWER_L1(*uc); \
2035 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2038 /* raw data, will be folded later if needed */ \
2046 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2047 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
2048 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2049 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
2051 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2052 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2053 TRIE_LIST_CUR( state )++; \
2056 #define TRIE_LIST_NEW(state) STMT_START { \
2057 Newxz( trie->states[ state ].trans.list, \
2058 4, reg_trie_trans_le ); \
2059 TRIE_LIST_CUR( state ) = 1; \
2060 TRIE_LIST_LEN( state ) = 4; \
2063 #define TRIE_HANDLE_WORD(state) STMT_START { \
2064 U16 dupe= trie->states[ state ].wordnum; \
2065 regnode * const noper_next = regnext( noper ); \
2068 /* store the word for dumping */ \
2070 if (OP(noper) != NOTHING) \
2071 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
2073 tmp = newSVpvn_utf8( "", 0, UTF ); \
2074 av_push( trie_words, tmp ); \
2078 trie->wordinfo[curword].prev = 0; \
2079 trie->wordinfo[curword].len = wordlen; \
2080 trie->wordinfo[curword].accept = state; \
2082 if ( noper_next < tail ) { \
2084 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2086 trie->jump[curword] = (U16)(noper_next - convert); \
2088 jumper = noper_next; \
2090 nextbranch= regnext(cur); \
2094 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2095 /* chain, so that when the bits of chain are later */\
2096 /* linked together, the dups appear in the chain */\
2097 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2098 trie->wordinfo[dupe].prev = curword; \
2100 /* we haven't inserted this word yet. */ \
2101 trie->states[ state ].wordnum = curword; \
2106 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2107 ( ( base + charid >= ucharcount \
2108 && base + charid < ubound \
2109 && state == trie->trans[ base - ucharcount + charid ].check \
2110 && trie->trans[ base - ucharcount + charid ].next ) \
2111 ? trie->trans[ base - ucharcount + charid ].next \
2112 : ( state==1 ? special : 0 ) \
2116 #define MADE_JUMP_TRIE 2
2117 #define MADE_EXACT_TRIE 4
2120 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2121 regnode *first, regnode *last, regnode *tail,
2122 U32 word_count, U32 flags, U32 depth)
2124 /* first pass, loop through and scan words */
2125 reg_trie_data *trie;
2126 HV *widecharmap = NULL;
2127 AV *revcharmap = newAV();
2133 regnode *jumper = NULL;
2134 regnode *nextbranch = NULL;
2135 regnode *convert = NULL;
2136 U32 *prev_states; /* temp array mapping each state to previous one */
2137 /* we just use folder as a flag in utf8 */
2138 const U8 * folder = NULL;
2141 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2142 AV *trie_words = NULL;
2143 /* along with revcharmap, this only used during construction but both are
2144 * useful during debugging so we store them in the struct when debugging.
2147 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
2148 STRLEN trie_charcount=0;
2150 SV *re_trie_maxbuff;
2151 GET_RE_DEBUG_FLAGS_DECL;
2153 PERL_ARGS_ASSERT_MAKE_TRIE;
2155 PERL_UNUSED_ARG(depth);
2159 case EXACT: case EXACTL: break;
2163 case EXACTFLU8: folder = PL_fold_latin1; break;
2164 case EXACTF: folder = PL_fold; break;
2165 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2168 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2170 trie->startstate = 1;
2171 trie->wordcount = word_count;
2172 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2173 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2174 if (flags == EXACT || flags == EXACTL)
2175 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2176 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2177 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2180 trie_words = newAV();
2183 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2184 assert(re_trie_maxbuff);
2185 if (!SvIOK(re_trie_maxbuff)) {
2186 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2188 DEBUG_TRIE_COMPILE_r({
2189 PerlIO_printf( Perl_debug_log,
2190 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2191 (int)depth * 2 + 2, "",
2192 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2193 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2196 /* Find the node we are going to overwrite */
2197 if ( first == startbranch && OP( last ) != BRANCH ) {
2198 /* whole branch chain */
2201 /* branch sub-chain */
2202 convert = NEXTOPER( first );
2205 /* -- First loop and Setup --
2207 We first traverse the branches and scan each word to determine if it
2208 contains widechars, and how many unique chars there are, this is
2209 important as we have to build a table with at least as many columns as we
2212 We use an array of integers to represent the character codes 0..255
2213 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2214 the native representation of the character value as the key and IV's for
2217 *TODO* If we keep track of how many times each character is used we can
2218 remap the columns so that the table compression later on is more
2219 efficient in terms of memory by ensuring the most common value is in the
2220 middle and the least common are on the outside. IMO this would be better
2221 than a most to least common mapping as theres a decent chance the most
2222 common letter will share a node with the least common, meaning the node
2223 will not be compressible. With a middle is most common approach the worst
2224 case is when we have the least common nodes twice.
2228 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2229 regnode *noper = NEXTOPER( cur );
2230 const U8 *uc = (U8*)STRING( noper );
2231 const U8 *e = uc + STR_LEN( noper );
2233 U32 wordlen = 0; /* required init */
2234 STRLEN minchars = 0;
2235 STRLEN maxchars = 0;
2236 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2239 if (OP(noper) == NOTHING) {
2240 regnode *noper_next= regnext(noper);
2241 if (noper_next != tail && OP(noper_next) == flags) {
2243 uc= (U8*)STRING(noper);
2244 e= uc + STR_LEN(noper);
2245 trie->minlen= STR_LEN(noper);
2252 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2253 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2254 regardless of encoding */
2255 if (OP( noper ) == EXACTFU_SS) {
2256 /* false positives are ok, so just set this */
2257 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2260 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2262 TRIE_CHARCOUNT(trie)++;
2265 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2266 * is in effect. Under /i, this character can match itself, or
2267 * anything that folds to it. If not under /i, it can match just
2268 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2269 * all fold to k, and all are single characters. But some folds
2270 * expand to more than one character, so for example LATIN SMALL
2271 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2272 * the string beginning at 'uc' is 'ffi', it could be matched by
2273 * three characters, or just by the one ligature character. (It
2274 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2275 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2276 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2277 * match.) The trie needs to know the minimum and maximum number
2278 * of characters that could match so that it can use size alone to
2279 * quickly reject many match attempts. The max is simple: it is
2280 * the number of folded characters in this branch (since a fold is
2281 * never shorter than what folds to it. */
2285 /* And the min is equal to the max if not under /i (indicated by
2286 * 'folder' being NULL), or there are no multi-character folds. If
2287 * there is a multi-character fold, the min is incremented just
2288 * once, for the character that folds to the sequence. Each
2289 * character in the sequence needs to be added to the list below of
2290 * characters in the trie, but we count only the first towards the
2291 * min number of characters needed. This is done through the
2292 * variable 'foldlen', which is returned by the macros that look
2293 * for these sequences as the number of bytes the sequence
2294 * occupies. Each time through the loop, we decrement 'foldlen' by
2295 * how many bytes the current char occupies. Only when it reaches
2296 * 0 do we increment 'minchars' or look for another multi-character
2298 if (folder == NULL) {
2301 else if (foldlen > 0) {
2302 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2307 /* See if *uc is the beginning of a multi-character fold. If
2308 * so, we decrement the length remaining to look at, to account
2309 * for the current character this iteration. (We can use 'uc'
2310 * instead of the fold returned by TRIE_READ_CHAR because for
2311 * non-UTF, the latin1_safe macro is smart enough to account
2312 * for all the unfolded characters, and because for UTF, the
2313 * string will already have been folded earlier in the
2314 * compilation process */
2316 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2317 foldlen -= UTF8SKIP(uc);
2320 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2325 /* The current character (and any potential folds) should be added
2326 * to the possible matching characters for this position in this
2330 U8 folded= folder[ (U8) uvc ];
2331 if ( !trie->charmap[ folded ] ) {
2332 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2333 TRIE_STORE_REVCHAR( folded );
2336 if ( !trie->charmap[ uvc ] ) {
2337 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2338 TRIE_STORE_REVCHAR( uvc );
2341 /* store the codepoint in the bitmap, and its folded
2343 TRIE_BITMAP_SET(trie, uvc);
2345 /* store the folded codepoint */
2346 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2349 /* store first byte of utf8 representation of
2350 variant codepoints */
2351 if (! UVCHR_IS_INVARIANT(uvc)) {
2352 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2355 set_bit = 0; /* We've done our bit :-) */
2359 /* XXX We could come up with the list of code points that fold
2360 * to this using PL_utf8_foldclosures, except not for
2361 * multi-char folds, as there may be multiple combinations
2362 * there that could work, which needs to wait until runtime to
2363 * resolve (The comment about LIGATURE FFI above is such an
2368 widecharmap = newHV();
2370 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2373 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2375 if ( !SvTRUE( *svpp ) ) {
2376 sv_setiv( *svpp, ++trie->uniquecharcount );
2377 TRIE_STORE_REVCHAR(uvc);
2380 } /* end loop through characters in this branch of the trie */
2382 /* We take the min and max for this branch and combine to find the min
2383 * and max for all branches processed so far */
2384 if( cur == first ) {
2385 trie->minlen = minchars;
2386 trie->maxlen = maxchars;
2387 } else if (minchars < trie->minlen) {
2388 trie->minlen = minchars;
2389 } else if (maxchars > trie->maxlen) {
2390 trie->maxlen = maxchars;
2392 } /* end first pass */
2393 DEBUG_TRIE_COMPILE_r(
2394 PerlIO_printf( Perl_debug_log,
2395 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2396 (int)depth * 2 + 2,"",
2397 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2398 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2399 (int)trie->minlen, (int)trie->maxlen )
2403 We now know what we are dealing with in terms of unique chars and
2404 string sizes so we can calculate how much memory a naive
2405 representation using a flat table will take. If it's over a reasonable
2406 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2407 conservative but potentially much slower representation using an array
2410 At the end we convert both representations into the same compressed
2411 form that will be used in regexec.c for matching with. The latter
2412 is a form that cannot be used to construct with but has memory
2413 properties similar to the list form and access properties similar
2414 to the table form making it both suitable for fast searches and
2415 small enough that its feasable to store for the duration of a program.
2417 See the comment in the code where the compressed table is produced
2418 inplace from the flat tabe representation for an explanation of how
2419 the compression works.
2424 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2427 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2428 > SvIV(re_trie_maxbuff) )
2431 Second Pass -- Array Of Lists Representation
2433 Each state will be represented by a list of charid:state records
2434 (reg_trie_trans_le) the first such element holds the CUR and LEN
2435 points of the allocated array. (See defines above).
2437 We build the initial structure using the lists, and then convert
2438 it into the compressed table form which allows faster lookups
2439 (but cant be modified once converted).
2442 STRLEN transcount = 1;
2444 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2445 "%*sCompiling trie using list compiler\n",
2446 (int)depth * 2 + 2, ""));
2448 trie->states = (reg_trie_state *)
2449 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2450 sizeof(reg_trie_state) );
2454 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2456 regnode *noper = NEXTOPER( cur );
2457 U8 *uc = (U8*)STRING( noper );
2458 const U8 *e = uc + STR_LEN( noper );
2459 U32 state = 1; /* required init */
2460 U16 charid = 0; /* sanity init */
2461 U32 wordlen = 0; /* required init */
2463 if (OP(noper) == NOTHING) {
2464 regnode *noper_next= regnext(noper);
2465 if (noper_next != tail && OP(noper_next) == flags) {
2467 uc= (U8*)STRING(noper);
2468 e= uc + STR_LEN(noper);
2472 if (OP(noper) != NOTHING) {
2473 for ( ; uc < e ; uc += len ) {
2478 charid = trie->charmap[ uvc ];
2480 SV** const svpp = hv_fetch( widecharmap,
2487 charid=(U16)SvIV( *svpp );
2490 /* charid is now 0 if we dont know the char read, or
2491 * nonzero if we do */
2498 if ( !trie->states[ state ].trans.list ) {
2499 TRIE_LIST_NEW( state );
2502 check <= TRIE_LIST_USED( state );
2505 if ( TRIE_LIST_ITEM( state, check ).forid
2508 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2513 newstate = next_alloc++;
2514 prev_states[newstate] = state;
2515 TRIE_LIST_PUSH( state, charid, newstate );
2520 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2524 TRIE_HANDLE_WORD(state);
2526 } /* end second pass */
2528 /* next alloc is the NEXT state to be allocated */
2529 trie->statecount = next_alloc;
2530 trie->states = (reg_trie_state *)
2531 PerlMemShared_realloc( trie->states,
2533 * sizeof(reg_trie_state) );
2535 /* and now dump it out before we compress it */
2536 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2537 revcharmap, next_alloc,
2541 trie->trans = (reg_trie_trans *)
2542 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2549 for( state=1 ; state < next_alloc ; state ++ ) {
2553 DEBUG_TRIE_COMPILE_MORE_r(
2554 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2558 if (trie->states[state].trans.list) {
2559 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2563 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2564 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2565 if ( forid < minid ) {
2567 } else if ( forid > maxid ) {
2571 if ( transcount < tp + maxid - minid + 1) {
2573 trie->trans = (reg_trie_trans *)
2574 PerlMemShared_realloc( trie->trans,
2576 * sizeof(reg_trie_trans) );
2577 Zero( trie->trans + (transcount / 2),
2581 base = trie->uniquecharcount + tp - minid;
2582 if ( maxid == minid ) {
2584 for ( ; zp < tp ; zp++ ) {
2585 if ( ! trie->trans[ zp ].next ) {
2586 base = trie->uniquecharcount + zp - minid;
2587 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2589 trie->trans[ zp ].check = state;
2595 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2597 trie->trans[ tp ].check = state;
2602 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2603 const U32 tid = base
2604 - trie->uniquecharcount
2605 + TRIE_LIST_ITEM( state, idx ).forid;
2606 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2608 trie->trans[ tid ].check = state;
2610 tp += ( maxid - minid + 1 );
2612 Safefree(trie->states[ state ].trans.list);
2615 DEBUG_TRIE_COMPILE_MORE_r(
2616 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2619 trie->states[ state ].trans.base=base;
2621 trie->lasttrans = tp + 1;
2625 Second Pass -- Flat Table Representation.
2627 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2628 each. We know that we will need Charcount+1 trans at most to store
2629 the data (one row per char at worst case) So we preallocate both
2630 structures assuming worst case.
2632 We then construct the trie using only the .next slots of the entry
2635 We use the .check field of the first entry of the node temporarily
2636 to make compression both faster and easier by keeping track of how
2637 many non zero fields are in the node.
2639 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2642 There are two terms at use here: state as a TRIE_NODEIDX() which is
2643 a number representing the first entry of the node, and state as a
2644 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2645 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2646 if there are 2 entrys per node. eg:
2654 The table is internally in the right hand, idx form. However as we
2655 also have to deal with the states array which is indexed by nodenum
2656 we have to use TRIE_NODENUM() to convert.
2659 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2660 "%*sCompiling trie using table compiler\n",
2661 (int)depth * 2 + 2, ""));
2663 trie->trans = (reg_trie_trans *)
2664 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2665 * trie->uniquecharcount + 1,
2666 sizeof(reg_trie_trans) );
2667 trie->states = (reg_trie_state *)
2668 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2669 sizeof(reg_trie_state) );
2670 next_alloc = trie->uniquecharcount + 1;
2673 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2675 regnode *noper = NEXTOPER( cur );
2676 const U8 *uc = (U8*)STRING( noper );
2677 const U8 *e = uc + STR_LEN( noper );
2679 U32 state = 1; /* required init */
2681 U16 charid = 0; /* sanity init */
2682 U32 accept_state = 0; /* sanity init */
2684 U32 wordlen = 0; /* required init */
2686 if (OP(noper) == NOTHING) {
2687 regnode *noper_next= regnext(noper);
2688 if (noper_next != tail && OP(noper_next) == flags) {
2690 uc= (U8*)STRING(noper);
2691 e= uc + STR_LEN(noper);
2695 if ( OP(noper) != NOTHING ) {
2696 for ( ; uc < e ; uc += len ) {
2701 charid = trie->charmap[ uvc ];
2703 SV* const * const svpp = hv_fetch( widecharmap,
2707 charid = svpp ? (U16)SvIV(*svpp) : 0;
2711 if ( !trie->trans[ state + charid ].next ) {
2712 trie->trans[ state + charid ].next = next_alloc;
2713 trie->trans[ state ].check++;
2714 prev_states[TRIE_NODENUM(next_alloc)]
2715 = TRIE_NODENUM(state);
2716 next_alloc += trie->uniquecharcount;
2718 state = trie->trans[ state + charid ].next;
2720 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2722 /* charid is now 0 if we dont know the char read, or
2723 * nonzero if we do */
2726 accept_state = TRIE_NODENUM( state );
2727 TRIE_HANDLE_WORD(accept_state);
2729 } /* end second pass */
2731 /* and now dump it out before we compress it */
2732 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2734 next_alloc, depth+1));
2738 * Inplace compress the table.*
2740 For sparse data sets the table constructed by the trie algorithm will
2741 be mostly 0/FAIL transitions or to put it another way mostly empty.
2742 (Note that leaf nodes will not contain any transitions.)
2744 This algorithm compresses the tables by eliminating most such
2745 transitions, at the cost of a modest bit of extra work during lookup:
2747 - Each states[] entry contains a .base field which indicates the
2748 index in the state[] array wheres its transition data is stored.
2750 - If .base is 0 there are no valid transitions from that node.
2752 - If .base is nonzero then charid is added to it to find an entry in
2755 -If trans[states[state].base+charid].check!=state then the
2756 transition is taken to be a 0/Fail transition. Thus if there are fail
2757 transitions at the front of the node then the .base offset will point
2758 somewhere inside the previous nodes data (or maybe even into a node
2759 even earlier), but the .check field determines if the transition is
2763 The following process inplace converts the table to the compressed
2764 table: We first do not compress the root node 1,and mark all its
2765 .check pointers as 1 and set its .base pointer as 1 as well. This
2766 allows us to do a DFA construction from the compressed table later,
2767 and ensures that any .base pointers we calculate later are greater
2770 - We set 'pos' to indicate the first entry of the second node.
2772 - We then iterate over the columns of the node, finding the first and
2773 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2774 and set the .check pointers accordingly, and advance pos
2775 appropriately and repreat for the next node. Note that when we copy
2776 the next pointers we have to convert them from the original
2777 NODEIDX form to NODENUM form as the former is not valid post
2780 - If a node has no transitions used we mark its base as 0 and do not
2781 advance the pos pointer.
2783 - If a node only has one transition we use a second pointer into the
2784 structure to fill in allocated fail transitions from other states.
2785 This pointer is independent of the main pointer and scans forward
2786 looking for null transitions that are allocated to a state. When it
2787 finds one it writes the single transition into the "hole". If the
2788 pointer doesnt find one the single transition is appended as normal.
2790 - Once compressed we can Renew/realloc the structures to release the
2793 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2794 specifically Fig 3.47 and the associated pseudocode.
2798 const U32 laststate = TRIE_NODENUM( next_alloc );
2801 trie->statecount = laststate;
2803 for ( state = 1 ; state < laststate ; state++ ) {
2805 const U32 stateidx = TRIE_NODEIDX( state );
2806 const U32 o_used = trie->trans[ stateidx ].check;
2807 U32 used = trie->trans[ stateidx ].check;
2808 trie->trans[ stateidx ].check = 0;
2811 used && charid < trie->uniquecharcount;
2814 if ( flag || trie->trans[ stateidx + charid ].next ) {
2815 if ( trie->trans[ stateidx + charid ].next ) {
2817 for ( ; zp < pos ; zp++ ) {
2818 if ( ! trie->trans[ zp ].next ) {
2822 trie->states[ state ].trans.base
2824 + trie->uniquecharcount
2826 trie->trans[ zp ].next
2827 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2829 trie->trans[ zp ].check = state;
2830 if ( ++zp > pos ) pos = zp;
2837 trie->states[ state ].trans.base
2838 = pos + trie->uniquecharcount - charid ;
2840 trie->trans[ pos ].next
2841 = SAFE_TRIE_NODENUM(
2842 trie->trans[ stateidx + charid ].next );
2843 trie->trans[ pos ].check = state;
2848 trie->lasttrans = pos + 1;
2849 trie->states = (reg_trie_state *)
2850 PerlMemShared_realloc( trie->states, laststate
2851 * sizeof(reg_trie_state) );
2852 DEBUG_TRIE_COMPILE_MORE_r(
2853 PerlIO_printf( Perl_debug_log,
2854 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2855 (int)depth * 2 + 2,"",
2856 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2860 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2863 } /* end table compress */
2865 DEBUG_TRIE_COMPILE_MORE_r(
2866 PerlIO_printf(Perl_debug_log,
2867 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2868 (int)depth * 2 + 2, "",
2869 (UV)trie->statecount,
2870 (UV)trie->lasttrans)
2872 /* resize the trans array to remove unused space */
2873 trie->trans = (reg_trie_trans *)
2874 PerlMemShared_realloc( trie->trans, trie->lasttrans
2875 * sizeof(reg_trie_trans) );
2877 { /* Modify the program and insert the new TRIE node */
2878 U8 nodetype =(U8)(flags & 0xFF);
2882 regnode *optimize = NULL;
2883 #ifdef RE_TRACK_PATTERN_OFFSETS
2886 U32 mjd_nodelen = 0;
2887 #endif /* RE_TRACK_PATTERN_OFFSETS */
2888 #endif /* DEBUGGING */
2890 This means we convert either the first branch or the first Exact,
2891 depending on whether the thing following (in 'last') is a branch
2892 or not and whther first is the startbranch (ie is it a sub part of
2893 the alternation or is it the whole thing.)
2894 Assuming its a sub part we convert the EXACT otherwise we convert
2895 the whole branch sequence, including the first.
2897 /* Find the node we are going to overwrite */
2898 if ( first != startbranch || OP( last ) == BRANCH ) {
2899 /* branch sub-chain */
2900 NEXT_OFF( first ) = (U16)(last - first);
2901 #ifdef RE_TRACK_PATTERN_OFFSETS
2903 mjd_offset= Node_Offset((convert));
2904 mjd_nodelen= Node_Length((convert));
2907 /* whole branch chain */
2909 #ifdef RE_TRACK_PATTERN_OFFSETS
2912 const regnode *nop = NEXTOPER( convert );
2913 mjd_offset= Node_Offset((nop));
2914 mjd_nodelen= Node_Length((nop));
2918 PerlIO_printf(Perl_debug_log,
2919 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2920 (int)depth * 2 + 2, "",
2921 (UV)mjd_offset, (UV)mjd_nodelen)
2924 /* But first we check to see if there is a common prefix we can
2925 split out as an EXACT and put in front of the TRIE node. */
2926 trie->startstate= 1;
2927 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2929 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2933 const U32 base = trie->states[ state ].trans.base;
2935 if ( trie->states[state].wordnum )
2938 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2939 if ( ( base + ofs >= trie->uniquecharcount ) &&
2940 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2941 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2943 if ( ++count > 1 ) {
2944 SV **tmp = av_fetch( revcharmap, ofs, 0);
2945 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2946 if ( state == 1 ) break;
2948 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2950 PerlIO_printf(Perl_debug_log,
2951 "%*sNew Start State=%"UVuf" Class: [",
2952 (int)depth * 2 + 2, "",
2955 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2956 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2958 TRIE_BITMAP_SET(trie,*ch);
2960 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2962 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2966 TRIE_BITMAP_SET(trie,*ch);
2968 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2969 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2975 SV **tmp = av_fetch( revcharmap, idx, 0);
2977 char *ch = SvPV( *tmp, len );
2979 SV *sv=sv_newmortal();
2980 PerlIO_printf( Perl_debug_log,
2981 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2982 (int)depth * 2 + 2, "",
2984 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2985 PL_colors[0], PL_colors[1],
2986 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2987 PERL_PV_ESCAPE_FIRSTCHAR
2992 OP( convert ) = nodetype;
2993 str=STRING(convert);
2996 STR_LEN(convert) += len;
3002 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
3007 trie->prefixlen = (state-1);
3009 regnode *n = convert+NODE_SZ_STR(convert);
3010 NEXT_OFF(convert) = NODE_SZ_STR(convert);
3011 trie->startstate = state;
3012 trie->minlen -= (state - 1);
3013 trie->maxlen -= (state - 1);
3015 /* At least the UNICOS C compiler choked on this
3016 * being argument to DEBUG_r(), so let's just have
3019 #ifdef PERL_EXT_RE_BUILD
3025 regnode *fix = convert;
3026 U32 word = trie->wordcount;
3028 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3029 while( ++fix < n ) {
3030 Set_Node_Offset_Length(fix, 0, 0);
3033 SV ** const tmp = av_fetch( trie_words, word, 0 );
3035 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3036 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3038 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3046 NEXT_OFF(convert) = (U16)(tail - convert);
3047 DEBUG_r(optimize= n);
3053 if ( trie->maxlen ) {
3054 NEXT_OFF( convert ) = (U16)(tail - convert);
3055 ARG_SET( convert, data_slot );
3056 /* Store the offset to the first unabsorbed branch in
3057 jump[0], which is otherwise unused by the jump logic.
3058 We use this when dumping a trie and during optimisation. */
3060 trie->jump[0] = (U16)(nextbranch - convert);
3062 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3063 * and there is a bitmap
3064 * and the first "jump target" node we found leaves enough room
3065 * then convert the TRIE node into a TRIEC node, with the bitmap
3066 * embedded inline in the opcode - this is hypothetically faster.
3068 if ( !trie->states[trie->startstate].wordnum
3070 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
3072 OP( convert ) = TRIEC;
3073 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
3074 PerlMemShared_free(trie->bitmap);
3077 OP( convert ) = TRIE;
3079 /* store the type in the flags */
3080 convert->flags = nodetype;
3084 + regarglen[ OP( convert ) ];
3086 /* XXX We really should free up the resource in trie now,
3087 as we won't use them - (which resources?) dmq */
3089 /* needed for dumping*/
3090 DEBUG_r(if (optimize) {
3091 regnode *opt = convert;
3093 while ( ++opt < optimize) {
3094 Set_Node_Offset_Length(opt,0,0);
3097 Try to clean up some of the debris left after the
3100 while( optimize < jumper ) {
3101 mjd_nodelen += Node_Length((optimize));
3102 OP( optimize ) = OPTIMIZED;
3103 Set_Node_Offset_Length(optimize,0,0);
3106 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
3108 } /* end node insert */
3109 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
3111 /* Finish populating the prev field of the wordinfo array. Walk back
3112 * from each accept state until we find another accept state, and if
3113 * so, point the first word's .prev field at the second word. If the
3114 * second already has a .prev field set, stop now. This will be the
3115 * case either if we've already processed that word's accept state,
3116 * or that state had multiple words, and the overspill words were
3117 * already linked up earlier.
3124 for (word=1; word <= trie->wordcount; word++) {
3126 if (trie->wordinfo[word].prev)
3128 state = trie->wordinfo[word].accept;
3130 state = prev_states[state];
3133 prev = trie->states[state].wordnum;
3137 trie->wordinfo[word].prev = prev;
3139 Safefree(prev_states);
3143 /* and now dump out the compressed format */
3144 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3146 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
3148 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3149 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3151 SvREFCNT_dec_NN(revcharmap);
3155 : trie->startstate>1
3161 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3163 /* The Trie is constructed and compressed now so we can build a fail array if
3166 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3168 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3172 We find the fail state for each state in the trie, this state is the longest
3173 proper suffix of the current state's 'word' that is also a proper prefix of
3174 another word in our trie. State 1 represents the word '' and is thus the
3175 default fail state. This allows the DFA not to have to restart after its
3176 tried and failed a word at a given point, it simply continues as though it
3177 had been matching the other word in the first place.
3179 'abcdgu'=~/abcdefg|cdgu/
3180 When we get to 'd' we are still matching the first word, we would encounter
3181 'g' which would fail, which would bring us to the state representing 'd' in
3182 the second word where we would try 'g' and succeed, proceeding to match
3185 /* add a fail transition */
3186 const U32 trie_offset = ARG(source);
3187 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3189 const U32 ucharcount = trie->uniquecharcount;
3190 const U32 numstates = trie->statecount;
3191 const U32 ubound = trie->lasttrans + ucharcount;
3195 U32 base = trie->states[ 1 ].trans.base;
3198 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3200 GET_RE_DEBUG_FLAGS_DECL;
3202 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3203 PERL_UNUSED_CONTEXT;
3205 PERL_UNUSED_ARG(depth);
3208 if ( OP(source) == TRIE ) {
3209 struct regnode_1 *op = (struct regnode_1 *)
3210 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3211 StructCopy(source,op,struct regnode_1);
3212 stclass = (regnode *)op;
3214 struct regnode_charclass *op = (struct regnode_charclass *)
3215 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3216 StructCopy(source,op,struct regnode_charclass);
3217 stclass = (regnode *)op;
3219 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
3221 ARG_SET( stclass, data_slot );
3222 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3223 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3224 aho->trie=trie_offset;
3225 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3226 Copy( trie->states, aho->states, numstates, reg_trie_state );
3227 Newxz( q, numstates, U32);
3228 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3231 /* initialize fail[0..1] to be 1 so that we always have
3232 a valid final fail state */
3233 fail[ 0 ] = fail[ 1 ] = 1;
3235 for ( charid = 0; charid < ucharcount ; charid++ ) {
3236 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3238 q[ q_write ] = newstate;
3239 /* set to point at the root */
3240 fail[ q[ q_write++ ] ]=1;
3243 while ( q_read < q_write) {
3244 const U32 cur = q[ q_read++ % numstates ];
3245 base = trie->states[ cur ].trans.base;
3247 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3248 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3250 U32 fail_state = cur;
3253 fail_state = fail[ fail_state ];
3254 fail_base = aho->states[ fail_state ].trans.base;
3255 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3257 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3258 fail[ ch_state ] = fail_state;
3259 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3261 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3263 q[ q_write++ % numstates] = ch_state;
3267 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3268 when we fail in state 1, this allows us to use the
3269 charclass scan to find a valid start char. This is based on the principle
3270 that theres a good chance the string being searched contains lots of stuff
3271 that cant be a start char.
3273 fail[ 0 ] = fail[ 1 ] = 0;
3274 DEBUG_TRIE_COMPILE_r({
3275 PerlIO_printf(Perl_debug_log,
3276 "%*sStclass Failtable (%"UVuf" states): 0",
3277 (int)(depth * 2), "", (UV)numstates
3279 for( q_read=1; q_read<numstates; q_read++ ) {
3280 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3282 PerlIO_printf(Perl_debug_log, "\n");
3285 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3290 #define DEBUG_PEEP(str,scan,depth) \
3291 DEBUG_OPTIMISE_r({if (scan){ \
3292 regnode *Next = regnext(scan); \
3293 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
3294 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
3295 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3296 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3297 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3298 PerlIO_printf(Perl_debug_log, "\n"); \
3301 /* The below joins as many adjacent EXACTish nodes as possible into a single
3302 * one. The regop may be changed if the node(s) contain certain sequences that
3303 * require special handling. The joining is only done if:
3304 * 1) there is room in the current conglomerated node to entirely contain the
3306 * 2) they are the exact same node type
3308 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3309 * these get optimized out
3311 * If a node is to match under /i (folded), the number of characters it matches
3312 * can be different than its character length if it contains a multi-character
3313 * fold. *min_subtract is set to the total delta number of characters of the
3316 * And *unfolded_multi_char is set to indicate whether or not the node contains
3317 * an unfolded multi-char fold. This happens when whether the fold is valid or
3318 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3319 * SMALL LETTER SHARP S, as only if the target string being matched against
3320 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3321 * folding rules depend on the locale in force at runtime. (Multi-char folds
3322 * whose components are all above the Latin1 range are not run-time locale
3323 * dependent, and have already been folded by the time this function is
3326 * This is as good a place as any to discuss the design of handling these
3327 * multi-character fold sequences. It's been wrong in Perl for a very long
3328 * time. There are three code points in Unicode whose multi-character folds
3329 * were long ago discovered to mess things up. The previous designs for
3330 * dealing with these involved assigning a special node for them. This
3331 * approach doesn't always work, as evidenced by this example:
3332 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3333 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3334 * would match just the \xDF, it won't be able to handle the case where a
3335 * successful match would have to cross the node's boundary. The new approach
3336 * that hopefully generally solves the problem generates an EXACTFU_SS node
3337 * that is "sss" in this case.
3339 * It turns out that there are problems with all multi-character folds, and not
3340 * just these three. Now the code is general, for all such cases. The
3341 * approach taken is:
3342 * 1) This routine examines each EXACTFish node that could contain multi-
3343 * character folded sequences. Since a single character can fold into
3344 * such a sequence, the minimum match length for this node is less than
3345 * the number of characters in the node. This routine returns in
3346 * *min_subtract how many characters to subtract from the the actual
3347 * length of the string to get a real minimum match length; it is 0 if
3348 * there are no multi-char foldeds. This delta is used by the caller to
3349 * adjust the min length of the match, and the delta between min and max,
3350 * so that the optimizer doesn't reject these possibilities based on size
3352 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3353 * is used for an EXACTFU node that contains at least one "ss" sequence in
3354 * it. For non-UTF-8 patterns and strings, this is the only case where
3355 * there is a possible fold length change. That means that a regular
3356 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3357 * with length changes, and so can be processed faster. regexec.c takes
3358 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3359 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3360 * known until runtime). This saves effort in regex matching. However,
3361 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3362 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3363 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3364 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3365 * possibilities for the non-UTF8 patterns are quite simple, except for
3366 * the sharp s. All the ones that don't involve a UTF-8 target string are
3367 * members of a fold-pair, and arrays are set up for all of them so that
3368 * the other member of the pair can be found quickly. Code elsewhere in
3369 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3370 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3371 * described in the next item.
3372 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3373 * validity of the fold won't be known until runtime, and so must remain
3374 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3375 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3376 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3377 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3378 * The reason this is a problem is that the optimizer part of regexec.c
3379 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3380 * that a character in the pattern corresponds to at most a single
3381 * character in the target string. (And I do mean character, and not byte
3382 * here, unlike other parts of the documentation that have never been
3383 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3384 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3385 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3386 * nodes, violate the assumption, and they are the only instances where it
3387 * is violated. I'm reluctant to try to change the assumption, as the
3388 * code involved is impenetrable to me (khw), so instead the code here
3389 * punts. This routine examines EXACTFL nodes, and (when the pattern
3390 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3391 * boolean indicating whether or not the node contains such a fold. When
3392 * it is true, the caller sets a flag that later causes the optimizer in
3393 * this file to not set values for the floating and fixed string lengths,
3394 * and thus avoids the optimizer code in regexec.c that makes the invalid
3395 * assumption. Thus, there is no optimization based on string lengths for
3396 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3397 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3398 * assumption is wrong only in these cases is that all other non-UTF-8
3399 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3400 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3401 * EXACTF nodes because we don't know at compile time if it actually
3402 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3403 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3404 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3405 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3406 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3407 * string would require the pattern to be forced into UTF-8, the overhead
3408 * of which we want to avoid. Similarly the unfolded multi-char folds in
3409 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3412 * Similarly, the code that generates tries doesn't currently handle
3413 * not-already-folded multi-char folds, and it looks like a pain to change
3414 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3415 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3416 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3417 * using /iaa matching will be doing so almost entirely with ASCII
3418 * strings, so this should rarely be encountered in practice */
3420 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3421 if (PL_regkind[OP(scan)] == EXACT) \
3422 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3425 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3426 UV *min_subtract, bool *unfolded_multi_char,
3427 U32 flags,regnode *val, U32 depth)
3429 /* Merge several consecutive EXACTish nodes into one. */
3430 regnode *n = regnext(scan);
3432 regnode *next = scan + NODE_SZ_STR(scan);
3436 regnode *stop = scan;
3437 GET_RE_DEBUG_FLAGS_DECL;
3439 PERL_UNUSED_ARG(depth);
3442 PERL_ARGS_ASSERT_JOIN_EXACT;
3443 #ifndef EXPERIMENTAL_INPLACESCAN
3444 PERL_UNUSED_ARG(flags);
3445 PERL_UNUSED_ARG(val);
3447 DEBUG_PEEP("join",scan,depth);
3449 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3450 * EXACT ones that are mergeable to the current one. */
3452 && (PL_regkind[OP(n)] == NOTHING
3453 || (stringok && OP(n) == OP(scan)))
3455 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3458 if (OP(n) == TAIL || n > next)
3460 if (PL_regkind[OP(n)] == NOTHING) {
3461 DEBUG_PEEP("skip:",n,depth);
3462 NEXT_OFF(scan) += NEXT_OFF(n);
3463 next = n + NODE_STEP_REGNODE;
3470 else if (stringok) {
3471 const unsigned int oldl = STR_LEN(scan);
3472 regnode * const nnext = regnext(n);
3474 /* XXX I (khw) kind of doubt that this works on platforms (should
3475 * Perl ever run on one) where U8_MAX is above 255 because of lots
3476 * of other assumptions */
3477 /* Don't join if the sum can't fit into a single node */
3478 if (oldl + STR_LEN(n) > U8_MAX)
3481 DEBUG_PEEP("merg",n,depth);
3484 NEXT_OFF(scan) += NEXT_OFF(n);
3485 STR_LEN(scan) += STR_LEN(n);
3486 next = n + NODE_SZ_STR(n);
3487 /* Now we can overwrite *n : */
3488 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3496 #ifdef EXPERIMENTAL_INPLACESCAN
3497 if (flags && !NEXT_OFF(n)) {
3498 DEBUG_PEEP("atch", val, depth);
3499 if (reg_off_by_arg[OP(n)]) {
3500 ARG_SET(n, val - n);
3503 NEXT_OFF(n) = val - n;
3511 *unfolded_multi_char = FALSE;
3513 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3514 * can now analyze for sequences of problematic code points. (Prior to
3515 * this final joining, sequences could have been split over boundaries, and
3516 * hence missed). The sequences only happen in folding, hence for any
3517 * non-EXACT EXACTish node */
3518 if (OP(scan) != EXACT && OP(scan) != EXACTL) {
3519 U8* s0 = (U8*) STRING(scan);
3521 U8* s_end = s0 + STR_LEN(scan);
3523 int total_count_delta = 0; /* Total delta number of characters that
3524 multi-char folds expand to */
3526 /* One pass is made over the node's string looking for all the
3527 * possibilities. To avoid some tests in the loop, there are two main
3528 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3533 if (OP(scan) == EXACTFL) {
3536 /* An EXACTFL node would already have been changed to another
3537 * node type unless there is at least one character in it that
3538 * is problematic; likely a character whose fold definition
3539 * won't be known until runtime, and so has yet to be folded.
3540 * For all but the UTF-8 locale, folds are 1-1 in length, but
3541 * to handle the UTF-8 case, we need to create a temporary
3542 * folded copy using UTF-8 locale rules in order to analyze it.
3543 * This is because our macros that look to see if a sequence is
3544 * a multi-char fold assume everything is folded (otherwise the
3545 * tests in those macros would be too complicated and slow).
3546 * Note that here, the non-problematic folds will have already
3547 * been done, so we can just copy such characters. We actually
3548 * don't completely fold the EXACTFL string. We skip the
3549 * unfolded multi-char folds, as that would just create work
3550 * below to figure out the size they already are */
3552 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3555 STRLEN s_len = UTF8SKIP(s);
3556 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3557 Copy(s, d, s_len, U8);
3560 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3561 *unfolded_multi_char = TRUE;
3562 Copy(s, d, s_len, U8);
3565 else if (isASCII(*s)) {
3566 *(d++) = toFOLD(*s);
3570 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3576 /* Point the remainder of the routine to look at our temporary
3580 } /* End of creating folded copy of EXACTFL string */
3582 /* Examine the string for a multi-character fold sequence. UTF-8
3583 * patterns have all characters pre-folded by the time this code is
3585 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3586 length sequence we are looking for is 2 */
3588 int count = 0; /* How many characters in a multi-char fold */
3589 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3590 if (! len) { /* Not a multi-char fold: get next char */
3595 /* Nodes with 'ss' require special handling, except for
3596 * EXACTFA-ish for which there is no multi-char fold to this */
3597 if (len == 2 && *s == 's' && *(s+1) == 's'
3598 && OP(scan) != EXACTFA
3599 && OP(scan) != EXACTFA_NO_TRIE)
3602 if (OP(scan) != EXACTFL) {
3603 OP(scan) = EXACTFU_SS;
3607 else { /* Here is a generic multi-char fold. */
3608 U8* multi_end = s + len;
3610 /* Count how many characters are in it. In the case of
3611 * /aa, no folds which contain ASCII code points are
3612 * allowed, so check for those, and skip if found. */
3613 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3614 count = utf8_length(s, multi_end);
3618 while (s < multi_end) {
3621 goto next_iteration;
3631 /* The delta is how long the sequence is minus 1 (1 is how long
3632 * the character that folds to the sequence is) */
3633 total_count_delta += count - 1;
3637 /* We created a temporary folded copy of the string in EXACTFL
3638 * nodes. Therefore we need to be sure it doesn't go below zero,
3639 * as the real string could be shorter */
3640 if (OP(scan) == EXACTFL) {
3641 int total_chars = utf8_length((U8*) STRING(scan),
3642 (U8*) STRING(scan) + STR_LEN(scan));
3643 if (total_count_delta > total_chars) {
3644 total_count_delta = total_chars;
3648 *min_subtract += total_count_delta;
3651 else if (OP(scan) == EXACTFA) {
3653 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3654 * fold to the ASCII range (and there are no existing ones in the
3655 * upper latin1 range). But, as outlined in the comments preceding
3656 * this function, we need to flag any occurrences of the sharp s.
3657 * This character forbids trie formation (because of added
3660 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3661 OP(scan) = EXACTFA_NO_TRIE;
3662 *unfolded_multi_char = TRUE;
3671 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3672 * folds that are all Latin1. As explained in the comments
3673 * preceding this function, we look also for the sharp s in EXACTF
3674 * and EXACTFL nodes; it can be in the final position. Otherwise
3675 * we can stop looking 1 byte earlier because have to find at least
3676 * two characters for a multi-fold */
3677 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3682 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3683 if (! len) { /* Not a multi-char fold. */
3684 if (*s == LATIN_SMALL_LETTER_SHARP_S
3685 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3687 *unfolded_multi_char = TRUE;
3694 && isALPHA_FOLD_EQ(*s, 's')
3695 && isALPHA_FOLD_EQ(*(s+1), 's'))
3698 /* EXACTF nodes need to know that the minimum length
3699 * changed so that a sharp s in the string can match this
3700 * ss in the pattern, but they remain EXACTF nodes, as they
3701 * won't match this unless the target string is is UTF-8,
3702 * which we don't know until runtime. EXACTFL nodes can't
3703 * transform into EXACTFU nodes */
3704 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3705 OP(scan) = EXACTFU_SS;
3709 *min_subtract += len - 1;
3716 /* Allow dumping but overwriting the collection of skipped
3717 * ops and/or strings with fake optimized ops */
3718 n = scan + NODE_SZ_STR(scan);
3726 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3730 /* REx optimizer. Converts nodes into quicker variants "in place".
3731 Finds fixed substrings. */
3733 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3734 to the position after last scanned or to NULL. */
3736 #define INIT_AND_WITHP \
3737 assert(!and_withp); \
3738 Newx(and_withp,1, regnode_ssc); \
3739 SAVEFREEPV(and_withp)
3743 S_unwind_scan_frames(pTHX_ const void *p)
3745 scan_frame *f= (scan_frame *)p;
3747 scan_frame *n= f->next_frame;
3755 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3756 SSize_t *minlenp, SSize_t *deltap,
3761 regnode_ssc *and_withp,
3762 U32 flags, U32 depth)
3763 /* scanp: Start here (read-write). */
3764 /* deltap: Write maxlen-minlen here. */
3765 /* last: Stop before this one. */
3766 /* data: string data about the pattern */
3767 /* stopparen: treat close N as END */
3768 /* recursed: which subroutines have we recursed into */
3769 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3771 /* There must be at least this number of characters to match */
3774 regnode *scan = *scanp, *next;
3776 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3777 int is_inf_internal = 0; /* The studied chunk is infinite */
3778 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3779 scan_data_t data_fake;
3780 SV *re_trie_maxbuff = NULL;
3781 regnode *first_non_open = scan;
3782 SSize_t stopmin = SSize_t_MAX;
3783 scan_frame *frame = NULL;
3784 GET_RE_DEBUG_FLAGS_DECL;
3786 PERL_ARGS_ASSERT_STUDY_CHUNK;
3790 while (first_non_open && OP(first_non_open) == OPEN)
3791 first_non_open=regnext(first_non_open);
3797 RExC_study_chunk_recursed_count++;
3799 DEBUG_OPTIMISE_MORE_r(
3801 PerlIO_printf(Perl_debug_log,
3802 "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
3803 (int)(depth*2), "", (long)stopparen,
3804 (unsigned long)RExC_study_chunk_recursed_count,
3805 (unsigned long)depth, (unsigned long)recursed_depth,
3808 if (recursed_depth) {
3811 for ( j = 0 ; j < recursed_depth ; j++ ) {
3812 for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
3814 PAREN_TEST(RExC_study_chunk_recursed +
3815 ( j * RExC_study_chunk_recursed_bytes), i )
3818 !PAREN_TEST(RExC_study_chunk_recursed +
3819 (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
3822 PerlIO_printf(Perl_debug_log," %d",(int)i);
3826 if ( j + 1 < recursed_depth ) {
3827 PerlIO_printf(Perl_debug_log, ",");
3831 PerlIO_printf(Perl_debug_log,"\n");
3834 while ( scan && OP(scan) != END && scan < last ){
3835 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3836 node length to get a real minimum (because
3837 the folded version may be shorter) */
3838 bool unfolded_multi_char = FALSE;
3839 /* Peephole optimizer: */
3840 DEBUG_STUDYDATA("Peep:", data, depth);
3841 DEBUG_PEEP("Peep", scan, depth);
3844 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3845 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3846 * by a different invocation of reg() -- Yves
3848 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3850 /* Follow the next-chain of the current node and optimize
3851 away all the NOTHINGs from it. */
3852 if (OP(scan) != CURLYX) {
3853 const int max = (reg_off_by_arg[OP(scan)]
3855 /* I32 may be smaller than U16 on CRAYs! */
3856 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3857 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3861 /* Skip NOTHING and LONGJMP. */
3862 while ((n = regnext(n))
3863 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3864 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3865 && off + noff < max)
3867 if (reg_off_by_arg[OP(scan)])
3870 NEXT_OFF(scan) = off;
3873 /* The principal pseudo-switch. Cannot be a switch, since we
3874 look into several different things. */
3875 if ( OP(scan) == DEFINEP ) {
3877 SSize_t deltanext = 0;
3878 SSize_t fake_last_close = 0;
3879 I32 f = SCF_IN_DEFINE;
3881 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3882 scan = regnext(scan);
3883 assert( OP(scan) == IFTHEN );
3884 DEBUG_PEEP("expect IFTHEN", scan, depth);
3886 data_fake.last_closep= &fake_last_close;
3888 next = regnext(scan);
3889 scan = NEXTOPER(NEXTOPER(scan));
3890 DEBUG_PEEP("scan", scan, depth);
3891 DEBUG_PEEP("next", next, depth);
3893 /* we suppose the run is continuous, last=next...
3894 * NOTE we dont use the return here! */
3895 (void)study_chunk(pRExC_state, &scan, &minlen,
3896 &deltanext, next, &data_fake, stopparen,
3897 recursed_depth, NULL, f, depth+1);
3902 OP(scan) == BRANCH ||
3903 OP(scan) == BRANCHJ ||
3906 next = regnext(scan);
3909 /* The op(next)==code check below is to see if we
3910 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
3911 * IFTHEN is special as it might not appear in pairs.
3912 * Not sure whether BRANCH-BRANCHJ is possible, regardless
3913 * we dont handle it cleanly. */
3914 if (OP(next) == code || code == IFTHEN) {
3915 /* NOTE - There is similar code to this block below for
3916 * handling TRIE nodes on a re-study. If you change stuff here
3917 * check there too. */
3918 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3920 regnode * const startbranch=scan;
3922 if (flags & SCF_DO_SUBSTR) {
3923 /* Cannot merge strings after this. */
3924 scan_commit(pRExC_state, data, minlenp, is_inf);
3927 if (flags & SCF_DO_STCLASS)
3928 ssc_init_zero(pRExC_state, &accum);
3930 while (OP(scan) == code) {
3931 SSize_t deltanext, minnext, fake;
3933 regnode_ssc this_class;
3935 DEBUG_PEEP("Branch", scan, depth);
3938 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3940 data_fake.whilem_c = data->whilem_c;
3941 data_fake.last_closep = data->last_closep;
3944 data_fake.last_closep = &fake;
3946 data_fake.pos_delta = delta;
3947 next = regnext(scan);
3949 scan = NEXTOPER(scan); /* everything */
3950 if (code != BRANCH) /* everything but BRANCH */
3951 scan = NEXTOPER(scan);
3953 if (flags & SCF_DO_STCLASS) {
3954 ssc_init(pRExC_state, &this_class);
3955 data_fake.start_class = &this_class;
3956 f = SCF_DO_STCLASS_AND;
3958 if (flags & SCF_WHILEM_VISITED_POS)
3959 f |= SCF_WHILEM_VISITED_POS;
3961 /* we suppose the run is continuous, last=next...*/
3962 minnext = study_chunk(pRExC_state, &scan, minlenp,
3963 &deltanext, next, &data_fake, stopparen,
3964 recursed_depth, NULL, f,depth+1);
3968 if (deltanext == SSize_t_MAX) {
3969 is_inf = is_inf_internal = 1;
3971 } else if (max1 < minnext + deltanext)
3972 max1 = minnext + deltanext;
3974 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3976 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3977 if ( stopmin > minnext)
3978 stopmin = min + min1;
3979 flags &= ~SCF_DO_SUBSTR;
3981 data->flags |= SCF_SEEN_ACCEPT;
3984 if (data_fake.flags & SF_HAS_EVAL)
3985 data->flags |= SF_HAS_EVAL;
3986 data->whilem_c = data_fake.whilem_c;
3988 if (flags & SCF_DO_STCLASS)
3989 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3991 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3993 if (flags & SCF_DO_SUBSTR) {
3994 data->pos_min += min1;
3995 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3996 data->pos_delta = SSize_t_MAX;
3998 data->pos_delta += max1 - min1;
3999 if (max1 != min1 || is_inf)
4000 data->longest = &(data->longest_float);
4003 if (delta == SSize_t_MAX
4004 || SSize_t_MAX - delta - (max1 - min1) < 0)
4005 delta = SSize_t_MAX;
4007 delta += max1 - min1;
4008 if (flags & SCF_DO_STCLASS_OR) {
4009 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
4011 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4012 flags &= ~SCF_DO_STCLASS;
4015 else if (flags & SCF_DO_STCLASS_AND) {
4017 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
4018 flags &= ~SCF_DO_STCLASS;
4021 /* Switch to OR mode: cache the old value of
4022 * data->start_class */
4024 StructCopy(data->start_class, and_withp, regnode_ssc);
4025 flags &= ~SCF_DO_STCLASS_AND;
4026 StructCopy(&accum, data->start_class, regnode_ssc);
4027 flags |= SCF_DO_STCLASS_OR;
4031 if (PERL_ENABLE_TRIE_OPTIMISATION &&
4032 OP( startbranch ) == BRANCH )
4036 Assuming this was/is a branch we are dealing with: 'scan'
4037 now points at the item that follows the branch sequence,
4038 whatever it is. We now start at the beginning of the
4039 sequence and look for subsequences of
4045 which would be constructed from a pattern like
4048 If we can find such a subsequence we need to turn the first
4049 element into a trie and then add the subsequent branch exact
4050 strings to the trie.
4054 1. patterns where the whole set of branches can be
4057 2. patterns where only a subset can be converted.
4059 In case 1 we can replace the whole set with a single regop
4060 for the trie. In case 2 we need to keep the start and end
4063 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
4064 becomes BRANCH TRIE; BRANCH X;
4066 There is an additional case, that being where there is a
4067 common prefix, which gets split out into an EXACT like node
4068 preceding the TRIE node.
4070 If x(1..n)==tail then we can do a simple trie, if not we make
4071 a "jump" trie, such that when we match the appropriate word
4072 we "jump" to the appropriate tail node. Essentially we turn
4073 a nested if into a case structure of sorts.
4078 if (!re_trie_maxbuff) {
4079 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
4080 if (!SvIOK(re_trie_maxbuff))
4081 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
4083 if ( SvIV(re_trie_maxbuff)>=0 ) {
4085 regnode *first = (regnode *)NULL;
4086 regnode *last = (regnode *)NULL;
4087 regnode *tail = scan;
4091 /* var tail is used because there may be a TAIL
4092 regop in the way. Ie, the exacts will point to the
4093 thing following the TAIL, but the last branch will
4094 point at the TAIL. So we advance tail. If we
4095 have nested (?:) we may have to move through several
4099 while ( OP( tail ) == TAIL ) {
4100 /* this is the TAIL generated by (?:) */
4101 tail = regnext( tail );
4105 DEBUG_TRIE_COMPILE_r({
4106 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
4107 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
4108 (int)depth * 2 + 2, "",
4109 "Looking for TRIE'able sequences. Tail node is: ",
4110 SvPV_nolen_const( RExC_mysv )
4116 Step through the branches
4117 cur represents each branch,
4118 noper is the first thing to be matched as part
4120 noper_next is the regnext() of that node.
4122 We normally handle a case like this
4123 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
4124 support building with NOJUMPTRIE, which restricts
4125 the trie logic to structures like /FOO|BAR/.
4127 If noper is a trieable nodetype then the branch is
4128 a possible optimization target. If we are building
4129 under NOJUMPTRIE then we require that noper_next is
4130 the same as scan (our current position in the regex
4133 Once we have two or more consecutive such branches
4134 we can create a trie of the EXACT's contents and
4135 stitch it in place into the program.
4137 If the sequence represents all of the branches in
4138 the alternation we replace the entire thing with a
4141 Otherwise when it is a subsequence we need to
4142 stitch it in place and replace only the relevant
4143 branches. This means the first branch has to remain
4144 as it is used by the alternation logic, and its
4145 next pointer, and needs to be repointed at the item
4146 on the branch chain following the last branch we
4147 have optimized away.
4149 This could be either a BRANCH, in which case the
4150 subsequence is internal, or it could be the item
4151 following the branch sequence in which case the
4152 subsequence is at the end (which does not
4153 necessarily mean the first node is the start of the
4156 TRIE_TYPE(X) is a define which maps the optype to a
4160 ----------------+-----------
4164 EXACTFU_SS | EXACTFU
4167 EXACTFLU8 | EXACTFLU8
4171 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
4173 : ( EXACT == (X) ) \
4175 : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
4177 : ( EXACTFA == (X) ) \
4179 : ( EXACTL == (X) ) \
4181 : ( EXACTFLU8 == (X) ) \
4185 /* dont use tail as the end marker for this traverse */
4186 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
4187 regnode * const noper = NEXTOPER( cur );
4188 U8 noper_type = OP( noper );
4189 U8 noper_trietype = TRIE_TYPE( noper_type );
4190 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
4191 regnode * const noper_next = regnext( noper );
4192 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
4193 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
4196 DEBUG_TRIE_COMPILE_r({
4197 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4198 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
4199 (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
4201 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
4202 PerlIO_printf( Perl_debug_log, " -> %s",
4203 SvPV_nolen_const(RExC_mysv));
4206 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
4207 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
4208 SvPV_nolen_const(RExC_mysv));
4210 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
4211 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
4212 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4216 /* Is noper a trieable nodetype that can be merged
4217 * with the current trie (if there is one)? */
4221 ( noper_trietype == NOTHING)
4222 || ( trietype == NOTHING )
4223 || ( trietype == noper_trietype )
4226 && noper_next == tail
4230 /* Handle mergable triable node Either we are
4231 * the first node in a new trieable sequence,
4232 * in which case we do some bookkeeping,
4233 * otherwise we update the end pointer. */
4236 if ( noper_trietype == NOTHING ) {
4237 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4238 regnode * const noper_next = regnext( noper );
4239 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4240 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4243 if ( noper_next_trietype ) {
4244 trietype = noper_next_trietype;
4245 } else if (noper_next_type) {
4246 /* a NOTHING regop is 1 regop wide.
4247 * We need at least two for a trie
4248 * so we can't merge this in */
4252 trietype = noper_trietype;
4255 if ( trietype == NOTHING )
4256 trietype = noper_trietype;
4261 } /* end handle mergable triable node */
4263 /* handle unmergable node -
4264 * noper may either be a triable node which can
4265 * not be tried together with the current trie,
4266 * or a non triable node */
4268 /* If last is set and trietype is not
4269 * NOTHING then we have found at least two
4270 * triable branch sequences in a row of a
4271 * similar trietype so we can turn them
4272 * into a trie. If/when we allow NOTHING to
4273 * start a trie sequence this condition
4274 * will be required, and it isn't expensive
4275 * so we leave it in for now. */
4276 if ( trietype && trietype != NOTHING )
4277 make_trie( pRExC_state,
4278 startbranch, first, cur, tail,
4279 count, trietype, depth+1 );
4280 last = NULL; /* note: we clear/update
4281 first, trietype etc below,
4282 so we dont do it here */
4286 && noper_next == tail
4289 /* noper is triable, so we can start a new
4293 trietype = noper_trietype;
4295 /* if we already saw a first but the
4296 * current node is not triable then we have
4297 * to reset the first information. */
4302 } /* end handle unmergable node */
4303 } /* loop over branches */
4304 DEBUG_TRIE_COMPILE_r({
4305 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4306 PerlIO_printf( Perl_debug_log,
4307 "%*s- %s (%d) <SCAN FINISHED>\n",
4309 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4312 if ( last && trietype ) {
4313 if ( trietype != NOTHING ) {
4314 /* the last branch of the sequence was part of
4315 * a trie, so we have to construct it here
4316 * outside of the loop */
4317 made= make_trie( pRExC_state, startbranch,
4318 first, scan, tail, count,
4319 trietype, depth+1 );
4320 #ifdef TRIE_STUDY_OPT
4321 if ( ((made == MADE_EXACT_TRIE &&
4322 startbranch == first)
4323 || ( first_non_open == first )) &&
4325 flags |= SCF_TRIE_RESTUDY;
4326 if ( startbranch == first
4329 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4334 /* at this point we know whatever we have is a
4335 * NOTHING sequence/branch AND if 'startbranch'
4336 * is 'first' then we can turn the whole thing
4339 if ( startbranch == first ) {
4341 /* the entire thing is a NOTHING sequence,
4342 * something like this: (?:|) So we can
4343 * turn it into a plain NOTHING op. */
4344 DEBUG_TRIE_COMPILE_r({
4345 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
4346 PerlIO_printf( Perl_debug_log,
4347 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4348 "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
4351 OP(startbranch)= NOTHING;
4352 NEXT_OFF(startbranch)= tail - startbranch;
4353 for ( opt= startbranch + 1; opt < tail ; opt++ )
4357 } /* end if ( last) */
4358 } /* TRIE_MAXBUF is non zero */
4363 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4364 scan = NEXTOPER(NEXTOPER(scan));
4365 } else /* single branch is optimized. */
4366 scan = NEXTOPER(scan);
4368 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4370 regnode *start = NULL;
4371 regnode *end = NULL;
4372 U32 my_recursed_depth= recursed_depth;
4375 if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
4376 /* Do setup, note this code has side effects beyond
4377 * the rest of this block. Specifically setting
4378 * RExC_recurse[] must happen at least once during
4380 if (OP(scan) == GOSUB) {
4382 RExC_recurse[ARG2L(scan)] = scan;
4383 start = RExC_open_parens[paren-1];
4384 end = RExC_close_parens[paren-1];
4386 start = RExC_rxi->program + 1;
4389 /* NOTE we MUST always execute the above code, even
4390 * if we do nothing with a GOSUB/GOSTART */
4392 ( flags & SCF_IN_DEFINE )
4395 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
4397 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
4400 /* no need to do anything here if we are in a define. */
4401 /* or we are after some kind of infinite construct
4402 * so we can skip recursing into this item.
4403 * Since it is infinite we will not change the maxlen
4404 * or delta, and if we miss something that might raise
4405 * the minlen it will merely pessimise a little.
4407 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
4408 * might result in a minlen of 1 and not of 4,
4409 * but this doesn't make us mismatch, just try a bit
4410 * harder than we should.
4412 scan= regnext(scan);
4419 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4421 /* it is quite possible that there are more efficient ways
4422 * to do this. We maintain a bitmap per level of recursion
4423 * of which patterns we have entered so we can detect if a
4424 * pattern creates a possible infinite loop. When we
4425 * recurse down a level we copy the previous levels bitmap
4426 * down. When we are at recursion level 0 we zero the top
4427 * level bitmap. It would be nice to implement a different
4428 * more efficient way of doing this. In particular the top
4429 * level bitmap may be unnecessary.
4431 if (!recursed_depth) {
4432 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4434 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4435 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4436 RExC_study_chunk_recursed_bytes, U8);
4438 /* we havent recursed into this paren yet, so recurse into it */
4439 DEBUG_STUDYDATA("set:", data,depth);
4440 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4441 my_recursed_depth= recursed_depth + 1;
4443 DEBUG_STUDYDATA("inf:", data,depth);
4444 /* some form of infinite recursion, assume infinite length
4446 if (flags & SCF_DO_SUBSTR) {
4447 scan_commit(pRExC_state, data, minlenp, is_inf);
4448 data->longest = &(data->longest_float);
4450 is_inf = is_inf_internal = 1;
4451 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4452 ssc_anything(data->start_class);
4453 flags &= ~SCF_DO_STCLASS;
4455 start= NULL; /* reset start so we dont recurse later on. */
4460 end = regnext(scan);
4463 scan_frame *newframe;
4465 if (!RExC_frame_last) {
4466 Newxz(newframe, 1, scan_frame);
4467 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
4468 RExC_frame_head= newframe;
4470 } else if (!RExC_frame_last->next_frame) {
4471 Newxz(newframe,1,scan_frame);
4472 RExC_frame_last->next_frame= newframe;
4473 newframe->prev_frame= RExC_frame_last;
4476 newframe= RExC_frame_last->next_frame;
4478 RExC_frame_last= newframe;
4480 newframe->next_regnode = regnext(scan);
4481 newframe->last_regnode = last;
4482 newframe->stopparen = stopparen;
4483 newframe->prev_recursed_depth = recursed_depth;
4484 newframe->this_prev_frame= frame;
4486 DEBUG_STUDYDATA("frame-new:",data,depth);
4487 DEBUG_PEEP("fnew", scan, depth);
4494 recursed_depth= my_recursed_depth;
4499 else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
4500 SSize_t l = STR_LEN(scan);
4503 const U8 * const s = (U8*)STRING(scan);
4504 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4505 l = utf8_length(s, s + l);
4507 uc = *((U8*)STRING(scan));
4510 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4511 /* The code below prefers earlier match for fixed
4512 offset, later match for variable offset. */
4513 if (data->last_end == -1) { /* Update the start info. */
4514 data->last_start_min = data->pos_min;
4515 data->last_start_max = is_inf
4516 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4518 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4520 SvUTF8_on(data->last_found);
4522 SV * const sv = data->last_found;
4523 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4524 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4525 if (mg && mg->mg_len >= 0)
4526 mg->mg_len += utf8_length((U8*)STRING(scan),
4527 (U8*)STRING(scan)+STR_LEN(scan));
4529 data->last_end = data->pos_min + l;
4530 data->pos_min += l; /* As in the first entry. */
4531 data->flags &= ~SF_BEFORE_EOL;
4534 /* ANDing the code point leaves at most it, and not in locale, and
4535 * can't match null string */
4536 if (flags & SCF_DO_STCLASS_AND) {
4537 ssc_cp_and(data->start_class, uc);
4538 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4539 ssc_clear_locale(data->start_class);
4541 else if (flags & SCF_DO_STCLASS_OR) {
4542 ssc_add_cp(data->start_class, uc);
4543 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4545 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4546 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4548 flags &= ~SCF_DO_STCLASS;
4550 else if (PL_regkind[OP(scan)] == EXACT) {
4551 /* But OP != EXACT!, so is EXACTFish */
4552 SSize_t l = STR_LEN(scan);
4553 const U8 * s = (U8*)STRING(scan);
4555 /* Search for fixed substrings supports EXACT only. */
4556 if (flags & SCF_DO_SUBSTR) {
4558 scan_commit(pRExC_state, data, minlenp, is_inf);
4561 l = utf8_length(s, s + l);
4563 if (unfolded_multi_char) {
4564 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4566 min += l - min_subtract;
4568 delta += min_subtract;
4569 if (flags & SCF_DO_SUBSTR) {
4570 data->pos_min += l - min_subtract;
4571 if (data->pos_min < 0) {
4574 data->pos_delta += min_subtract;
4576 data->longest = &(data->longest_float);
4580 if (flags & SCF_DO_STCLASS) {
4581 SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
4583 assert(EXACTF_invlist);
4584 if (flags & SCF_DO_STCLASS_AND) {
4585 if (OP(scan) != EXACTFL)
4586 ssc_clear_locale(data->start_class);
4587 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4588 ANYOF_POSIXL_ZERO(data->start_class);
4589 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4591 else { /* SCF_DO_STCLASS_OR */
4592 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4593 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4595 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4596 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4598 flags &= ~SCF_DO_STCLASS;
4599 SvREFCNT_dec(EXACTF_invlist);
4602 else if (REGNODE_VARIES(OP(scan))) {
4603 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4604 I32 fl = 0, f = flags;
4605 regnode * const oscan = scan;
4606 regnode_ssc this_class;
4607 regnode_ssc *oclass = NULL;
4608 I32 next_is_eval = 0;
4610 switch (PL_regkind[OP(scan)]) {
4611 case WHILEM: /* End of (?:...)* . */
4612 scan = NEXTOPER(scan);
4615 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4616 next = NEXTOPER(scan);
4617 if (OP(next) == EXACT
4618 || OP(next) == EXACTL
4619 || (flags & SCF_DO_STCLASS))
4622 maxcount = REG_INFTY;
4623 next = regnext(scan);
4624 scan = NEXTOPER(scan);
4628 if (flags & SCF_DO_SUBSTR)
4633 if (flags & SCF_DO_STCLASS) {
4635 maxcount = REG_INFTY;
4636 next = regnext(scan);
4637 scan = NEXTOPER(scan);
4640 if (flags & SCF_DO_SUBSTR) {
4641 scan_commit(pRExC_state, data, minlenp, is_inf);
4642 /* Cannot extend fixed substrings */
4643 data->longest = &(data->longest_float);
4645 is_inf = is_inf_internal = 1;
4646 scan = regnext(scan);
4647 goto optimize_curly_tail;
4649 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4650 && (scan->flags == stopparen))
4655 mincount = ARG1(scan);
4656 maxcount = ARG2(scan);
4658 next = regnext(scan);
4659 if (OP(scan) == CURLYX) {
4660 I32 lp = (data ? *(data->last_closep) : 0);
4661 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4663 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4664 next_is_eval = (OP(scan) == EVAL);
4666 if (flags & SCF_DO_SUBSTR) {
4668 scan_commit(pRExC_state, data, minlenp, is_inf);
4669 /* Cannot extend fixed substrings */
4670 pos_before = data->pos_min;
4674 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4676 data->flags |= SF_IS_INF;
4678 if (flags & SCF_DO_STCLASS) {
4679 ssc_init(pRExC_state, &this_class);
4680 oclass = data->start_class;
4681 data->start_class = &this_class;
4682 f |= SCF_DO_STCLASS_AND;
4683 f &= ~SCF_DO_STCLASS_OR;
4685 /* Exclude from super-linear cache processing any {n,m}
4686 regops for which the combination of input pos and regex
4687 pos is not enough information to determine if a match
4690 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4691 regex pos at the \s*, the prospects for a match depend not
4692 only on the input position but also on how many (bar\s*)
4693 repeats into the {4,8} we are. */
4694 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4695 f &= ~SCF_WHILEM_VISITED_POS;
4697 /* This will finish on WHILEM, setting scan, or on NULL: */
4698 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4699 last, data, stopparen, recursed_depth, NULL,
4701 ? (f & ~SCF_DO_SUBSTR)
4705 if (flags & SCF_DO_STCLASS)
4706 data->start_class = oclass;
4707 if (mincount == 0 || minnext == 0) {
4708 if (flags & SCF_DO_STCLASS_OR) {
4709 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4711 else if (flags & SCF_DO_STCLASS_AND) {
4712 /* Switch to OR mode: cache the old value of
4713 * data->start_class */
4715 StructCopy(data->start_class, and_withp, regnode_ssc);
4716 flags &= ~SCF_DO_STCLASS_AND;
4717 StructCopy(&this_class, data->start_class, regnode_ssc);
4718 flags |= SCF_DO_STCLASS_OR;
4719 ANYOF_FLAGS(data->start_class)
4720 |= SSC_MATCHES_EMPTY_STRING;
4722 } else { /* Non-zero len */
4723 if (flags & SCF_DO_STCLASS_OR) {
4724 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4725 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4727 else if (flags & SCF_DO_STCLASS_AND)
4728 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4729 flags &= ~SCF_DO_STCLASS;
4731 if (!scan) /* It was not CURLYX, but CURLY. */
4733 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4734 /* ? quantifier ok, except for (?{ ... }) */
4735 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4736 && (minnext == 0) && (deltanext == 0)
4737 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4738 && maxcount <= REG_INFTY/3) /* Complement check for big
4741 /* Fatal warnings may leak the regexp without this: */
4742 SAVEFREESV(RExC_rx_sv);
4743 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
4744 "Quantifier unexpected on zero-length expression "
4745 "in regex m/%"UTF8f"/",
4746 UTF8fARG(UTF, RExC_end - RExC_precomp,
4748 (void)ReREFCNT_inc(RExC_rx_sv);
4751 min += minnext * mincount;
4752 is_inf_internal |= deltanext == SSize_t_MAX
4753 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4754 is_inf |= is_inf_internal;
4756 delta = SSize_t_MAX;
4758 delta += (minnext + deltanext) * maxcount
4759 - minnext * mincount;
4761 /* Try powerful optimization CURLYX => CURLYN. */
4762 if ( OP(oscan) == CURLYX && data
4763 && data->flags & SF_IN_PAR
4764 && !(data->flags & SF_HAS_EVAL)
4765 && !deltanext && minnext == 1 ) {
4766 /* Try to optimize to CURLYN. */
4767 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4768 regnode * const nxt1 = nxt;
4775 if (!REGNODE_SIMPLE(OP(nxt))
4776 && !(PL_regkind[OP(nxt)] == EXACT
4777 && STR_LEN(nxt) == 1))
4783 if (OP(nxt) != CLOSE)
4785 if (RExC_open_parens) {
4786 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4787 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4789 /* Now we know that nxt2 is the only contents: */
4790 oscan->flags = (U8)ARG(nxt);
4792 OP(nxt1) = NOTHING; /* was OPEN. */
4795 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4796 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4797 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4798 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4799 OP(nxt + 1) = OPTIMIZED; /* was count. */
4800 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4805 /* Try optimization CURLYX => CURLYM. */
4806 if ( OP(oscan) == CURLYX && data
4807 && !(data->flags & SF_HAS_PAR)
4808 && !(data->flags & SF_HAS_EVAL)
4809 && !deltanext /* atom is fixed width */
4810 && minnext != 0 /* CURLYM can't handle zero width */
4812 /* Nor characters whose fold at run-time may be
4813 * multi-character */
4814 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4816 /* XXXX How to optimize if data == 0? */
4817 /* Optimize to a simpler form. */
4818 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4822 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4823 && (OP(nxt2) != WHILEM))
4825 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4826 /* Need to optimize away parenths. */
4827 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4828 /* Set the parenth number. */
4829 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4831 oscan->flags = (U8)ARG(nxt);
4832 if (RExC_open_parens) {
4833 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4834 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4836 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4837 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4840 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4841 OP(nxt + 1) = OPTIMIZED; /* was count. */
4842 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4843 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4846 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4847 regnode *nnxt = regnext(nxt1);
4849 if (reg_off_by_arg[OP(nxt1)])
4850 ARG_SET(nxt1, nxt2 - nxt1);
4851 else if (nxt2 - nxt1 < U16_MAX)
4852 NEXT_OFF(nxt1) = nxt2 - nxt1;
4854 OP(nxt) = NOTHING; /* Cannot beautify */
4859 /* Optimize again: */
4860 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4861 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4866 else if ((OP(oscan) == CURLYX)
4867 && (flags & SCF_WHILEM_VISITED_POS)
4868 /* See the comment on a similar expression above.
4869 However, this time it's not a subexpression
4870 we care about, but the expression itself. */
4871 && (maxcount == REG_INFTY)
4872 && data && ++data->whilem_c < 16) {
4873 /* This stays as CURLYX, we can put the count/of pair. */
4874 /* Find WHILEM (as in regexec.c) */
4875 regnode *nxt = oscan + NEXT_OFF(oscan);
4877 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4879 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4880 | (RExC_whilem_seen << 4)); /* On WHILEM */
4882 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4884 if (flags & SCF_DO_SUBSTR) {
4885 SV *last_str = NULL;
4886 STRLEN last_chrs = 0;
4887 int counted = mincount != 0;
4889 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4891 SSize_t b = pos_before >= data->last_start_min
4892 ? pos_before : data->last_start_min;
4894 const char * const s = SvPV_const(data->last_found, l);
4895 SSize_t old = b - data->last_start_min;
4898 old = utf8_hop((U8*)s, old) - (U8*)s;
4900 /* Get the added string: */
4901 last_str = newSVpvn_utf8(s + old, l, UTF);
4902 last_chrs = UTF ? utf8_length((U8*)(s + old),
4903 (U8*)(s + old + l)) : l;
4904 if (deltanext == 0 && pos_before == b) {
4905 /* What was added is a constant string */
4908 SvGROW(last_str, (mincount * l) + 1);
4909 repeatcpy(SvPVX(last_str) + l,
4910 SvPVX_const(last_str), l,
4912 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4913 /* Add additional parts. */
4914 SvCUR_set(data->last_found,
4915 SvCUR(data->last_found) - l);
4916 sv_catsv(data->last_found, last_str);
4918 SV * sv = data->last_found;
4920 SvUTF8(sv) && SvMAGICAL(sv) ?
4921 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4922 if (mg && mg->mg_len >= 0)
4923 mg->mg_len += last_chrs * (mincount-1);
4925 last_chrs *= mincount;
4926 data->last_end += l * (mincount - 1);
4929 /* start offset must point into the last copy */
4930 data->last_start_min += minnext * (mincount - 1);
4931 data->last_start_max =
4934 : data->last_start_max +
4935 (maxcount - 1) * (minnext + data->pos_delta);
4938 /* It is counted once already... */
4939 data->pos_min += minnext * (mincount - counted);
4941 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4942 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4943 " maxcount=%"UVuf" mincount=%"UVuf"\n",
4944 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4946 if (deltanext != SSize_t_MAX)
4947 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4948 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4949 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4951 if (deltanext == SSize_t_MAX
4952 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4953 data->pos_delta = SSize_t_MAX;
4955 data->pos_delta += - counted * deltanext +
4956 (minnext + deltanext) * maxcount - minnext * mincount;
4957 if (mincount != maxcount) {
4958 /* Cannot extend fixed substrings found inside
4960 scan_commit(pRExC_state, data, minlenp, is_inf);
4961 if (mincount && last_str) {
4962 SV * const sv = data->last_found;
4963 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4964 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4968 sv_setsv(sv, last_str);
4969 data->last_end = data->pos_min;
4970 data->last_start_min = data->pos_min - last_chrs;
4971 data->last_start_max = is_inf
4973 : data->pos_min + data->pos_delta - last_chrs;
4975 data->longest = &(data->longest_float);
4977 SvREFCNT_dec(last_str);
4979 if (data && (fl & SF_HAS_EVAL))
4980 data->flags |= SF_HAS_EVAL;
4981 optimize_curly_tail:
4982 if (OP(oscan) != CURLYX) {
4983 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4985 NEXT_OFF(oscan) += NEXT_OFF(next);
4991 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4996 if (flags & SCF_DO_SUBSTR) {
4997 /* Cannot expect anything... */
4998 scan_commit(pRExC_state, data, minlenp, is_inf);
4999 data->longest = &(data->longest_float);
5001 is_inf = is_inf_internal = 1;
5002 if (flags & SCF_DO_STCLASS_OR) {
5003 if (OP(scan) == CLUMP) {
5004 /* Actually is any start char, but very few code points
5005 * aren't start characters */
5006 ssc_match_all_cp(data->start_class);
5009 ssc_anything(data->start_class);
5012 flags &= ~SCF_DO_STCLASS;
5016 else if (OP(scan) == LNBREAK) {
5017 if (flags & SCF_DO_STCLASS) {
5018 if (flags & SCF_DO_STCLASS_AND) {
5019 ssc_intersection(data->start_class,
5020 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
5021 ssc_clear_locale(data->start_class);
5022 ANYOF_FLAGS(data->start_class)
5023 &= ~SSC_MATCHES_EMPTY_STRING;
5025 else if (flags & SCF_DO_STCLASS_OR) {
5026 ssc_union(data->start_class,
5027 PL_XPosix_ptrs[_CC_VERTSPACE],
5029 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5031 /* See commit msg for
5032 * 749e076fceedeb708a624933726e7989f2302f6a */
5033 ANYOF_FLAGS(data->start_class)
5034 &= ~SSC_MATCHES_EMPTY_STRING;
5036 flags &= ~SCF_DO_STCLASS;
5039 if (delta != SSize_t_MAX)
5040 delta++; /* Because of the 2 char string cr-lf */
5041 if (flags & SCF_DO_SUBSTR) {
5042 /* Cannot expect anything... */
5043 scan_commit(pRExC_state, data, minlenp, is_inf);
5045 data->pos_delta += 1;
5046 data->longest = &(data->longest_float);
5049 else if (REGNODE_SIMPLE(OP(scan))) {
5051 if (flags & SCF_DO_SUBSTR) {
5052 scan_commit(pRExC_state, data, minlenp, is_inf);
5056 if (flags & SCF_DO_STCLASS) {
5058 SV* my_invlist = NULL;
5061 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
5062 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
5064 /* Some of the logic below assumes that switching
5065 locale on will only add false positives. */
5070 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
5074 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5075 ssc_match_all_cp(data->start_class);
5080 SV* REG_ANY_invlist = _new_invlist(2);
5081 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
5083 if (flags & SCF_DO_STCLASS_OR) {
5084 ssc_union(data->start_class,
5086 TRUE /* TRUE => invert, hence all but \n
5090 else if (flags & SCF_DO_STCLASS_AND) {
5091 ssc_intersection(data->start_class,
5093 TRUE /* TRUE => invert */
5095 ssc_clear_locale(data->start_class);
5097 SvREFCNT_dec_NN(REG_ANY_invlist);
5103 if (flags & SCF_DO_STCLASS_AND)
5104 ssc_and(pRExC_state, data->start_class,
5105 (regnode_charclass *) scan);
5107 ssc_or(pRExC_state, data->start_class,
5108 (regnode_charclass *) scan);
5116 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
5117 if (flags & SCF_DO_STCLASS_AND) {
5118 bool was_there = cBOOL(
5119 ANYOF_POSIXL_TEST(data->start_class,
5121 ANYOF_POSIXL_ZERO(data->start_class);
5122 if (was_there) { /* Do an AND */
5123 ANYOF_POSIXL_SET(data->start_class, namedclass);
5125 /* No individual code points can now match */
5126 data->start_class->invlist
5127 = sv_2mortal(_new_invlist(0));
5130 int complement = namedclass + ((invert) ? -1 : 1);
5132 assert(flags & SCF_DO_STCLASS_OR);
5134 /* If the complement of this class was already there,
5135 * the result is that they match all code points,
5136 * (\d + \D == everything). Remove the classes from
5137 * future consideration. Locale is not relevant in
5139 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
5140 ssc_match_all_cp(data->start_class);
5141 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
5142 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5144 else { /* The usual case; just add this class to the
5146 ANYOF_POSIXL_SET(data->start_class, namedclass);
5151 case NPOSIXA: /* For these, we always know the exact set of
5156 if (FLAGS(scan) == _CC_ASCII) {
5157 my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
5160 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5161 PL_XPosix_ptrs[_CC_ASCII],
5172 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5174 /* NPOSIXD matches all upper Latin1 code points unless the
5175 * target string being matched is UTF-8, which is
5176 * unknowable until match time. Since we are going to
5177 * invert, we want to get rid of all of them so that the
5178 * inversion will match all */
5179 if (OP(scan) == NPOSIXD) {
5180 _invlist_subtract(my_invlist, PL_UpperLatin1,
5186 if (flags & SCF_DO_STCLASS_AND) {
5187 ssc_intersection(data->start_class, my_invlist, invert);
5188 ssc_clear_locale(data->start_class);
5191 assert(flags & SCF_DO_STCLASS_OR);
5192 ssc_union(data->start_class, my_invlist, invert);
5194 SvREFCNT_dec(my_invlist);
5196 if (flags & SCF_DO_STCLASS_OR)
5197 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5198 flags &= ~SCF_DO_STCLASS;
5201 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5202 data->flags |= (OP(scan) == MEOL
5205 scan_commit(pRExC_state, data, minlenp, is_inf);
5208 else if ( PL_regkind[OP(scan)] == BRANCHJ
5209 /* Lookbehind, or need to calculate parens/evals/stclass: */
5210 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5211 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5213 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5214 || OP(scan) == UNLESSM )
5216 /* Negative Lookahead/lookbehind
5217 In this case we can't do fixed string optimisation.
5220 SSize_t deltanext, minnext, fake = 0;
5225 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5227 data_fake.whilem_c = data->whilem_c;
5228 data_fake.last_closep = data->last_closep;
5231 data_fake.last_closep = &fake;
5232 data_fake.pos_delta = delta;
5233 if ( flags & SCF_DO_STCLASS && !scan->flags
5234 && OP(scan) == IFMATCH ) { /* Lookahead */
5235 ssc_init(pRExC_state, &intrnl);
5236 data_fake.start_class = &intrnl;
5237 f |= SCF_DO_STCLASS_AND;
5239 if (flags & SCF_WHILEM_VISITED_POS)
5240 f |= SCF_WHILEM_VISITED_POS;
5241 next = regnext(scan);
5242 nscan = NEXTOPER(NEXTOPER(scan));
5243 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5244 last, &data_fake, stopparen,
5245 recursed_depth, NULL, f, depth+1);
5248 FAIL("Variable length lookbehind not implemented");
5250 else if (minnext > (I32)U8_MAX) {
5251 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5254 scan->flags = (U8)minnext;
5257 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5259 if (data_fake.flags & SF_HAS_EVAL)
5260 data->flags |= SF_HAS_EVAL;
5261 data->whilem_c = data_fake.whilem_c;
5263 if (f & SCF_DO_STCLASS_AND) {
5264 if (flags & SCF_DO_STCLASS_OR) {
5265 /* OR before, AND after: ideally we would recurse with
5266 * data_fake to get the AND applied by study of the
5267 * remainder of the pattern, and then derecurse;
5268 * *** HACK *** for now just treat as "no information".
5269 * See [perl #56690].
5271 ssc_init(pRExC_state, data->start_class);
5273 /* AND before and after: combine and continue. These
5274 * assertions are zero-length, so can match an EMPTY
5276 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5277 ANYOF_FLAGS(data->start_class)
5278 |= SSC_MATCHES_EMPTY_STRING;
5282 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5284 /* Positive Lookahead/lookbehind
5285 In this case we can do fixed string optimisation,
5286 but we must be careful about it. Note in the case of
5287 lookbehind the positions will be offset by the minimum
5288 length of the pattern, something we won't know about
5289 until after the recurse.
5291 SSize_t deltanext, fake = 0;
5295 /* We use SAVEFREEPV so that when the full compile
5296 is finished perl will clean up the allocated
5297 minlens when it's all done. This way we don't
5298 have to worry about freeing them when we know
5299 they wont be used, which would be a pain.
5302 Newx( minnextp, 1, SSize_t );
5303 SAVEFREEPV(minnextp);
5306 StructCopy(data, &data_fake, scan_data_t);
5307 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5310 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5311 data_fake.last_found=newSVsv(data->last_found);
5315 data_fake.last_closep = &fake;
5316 data_fake.flags = 0;
5317 data_fake.pos_delta = delta;
5319 data_fake.flags |= SF_IS_INF;
5320 if ( flags & SCF_DO_STCLASS && !scan->flags
5321 && OP(scan) == IFMATCH ) { /* Lookahead */
5322 ssc_init(pRExC_state, &intrnl);
5323 data_fake.start_class = &intrnl;
5324 f |= SCF_DO_STCLASS_AND;
5326 if (flags & SCF_WHILEM_VISITED_POS)
5327 f |= SCF_WHILEM_VISITED_POS;
5328 next = regnext(scan);
5329 nscan = NEXTOPER(NEXTOPER(scan));
5331 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5332 &deltanext, last, &data_fake,
5333 stopparen, recursed_depth, NULL,
5337 FAIL("Variable length lookbehind not implemented");
5339 else if (*minnextp > (I32)U8_MAX) {
5340 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5343 scan->flags = (U8)*minnextp;
5348 if (f & SCF_DO_STCLASS_AND) {
5349 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5350 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5353 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5355 if (data_fake.flags & SF_HAS_EVAL)
5356 data->flags |= SF_HAS_EVAL;
5357 data->whilem_c = data_fake.whilem_c;
5358 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5359 if (RExC_rx->minlen<*minnextp)
5360 RExC_rx->minlen=*minnextp;
5361 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5362 SvREFCNT_dec_NN(data_fake.last_found);
5364 if ( data_fake.minlen_fixed != minlenp )
5366 data->offset_fixed= data_fake.offset_fixed;
5367 data->minlen_fixed= data_fake.minlen_fixed;
5368 data->lookbehind_fixed+= scan->flags;
5370 if ( data_fake.minlen_float != minlenp )
5372 data->minlen_float= data_fake.minlen_float;
5373 data->offset_float_min=data_fake.offset_float_min;
5374 data->offset_float_max=data_fake.offset_float_max;
5375 data->lookbehind_float+= scan->flags;
5382 else if (OP(scan) == OPEN) {
5383 if (stopparen != (I32)ARG(scan))
5386 else if (OP(scan) == CLOSE) {
5387 if (stopparen == (I32)ARG(scan)) {
5390 if ((I32)ARG(scan) == is_par) {
5391 next = regnext(scan);
5393 if ( next && (OP(next) != WHILEM) && next < last)
5394 is_par = 0; /* Disable optimization */
5397 *(data->last_closep) = ARG(scan);
5399 else if (OP(scan) == EVAL) {
5401 data->flags |= SF_HAS_EVAL;
5403 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5404 if (flags & SCF_DO_SUBSTR) {
5405 scan_commit(pRExC_state, data, minlenp, is_inf);
5406 flags &= ~SCF_DO_SUBSTR;
5408 if (data && OP(scan)==ACCEPT) {
5409 data->flags |= SCF_SEEN_ACCEPT;
5414 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5416 if (flags & SCF_DO_SUBSTR) {
5417 scan_commit(pRExC_state, data, minlenp, is_inf);
5418 data->longest = &(data->longest_float);
5420 is_inf = is_inf_internal = 1;
5421 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5422 ssc_anything(data->start_class);
5423 flags &= ~SCF_DO_STCLASS;
5425 else if (OP(scan) == GPOS) {
5426 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5427 !(delta || is_inf || (data && data->pos_delta)))
5429 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5430 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5431 if (RExC_rx->gofs < (STRLEN)min)
5432 RExC_rx->gofs = min;
5434 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5438 #ifdef TRIE_STUDY_OPT
5439 #ifdef FULL_TRIE_STUDY
5440 else if (PL_regkind[OP(scan)] == TRIE) {
5441 /* NOTE - There is similar code to this block above for handling
5442 BRANCH nodes on the initial study. If you change stuff here
5444 regnode *trie_node= scan;
5445 regnode *tail= regnext(scan);
5446 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5447 SSize_t max1 = 0, min1 = SSize_t_MAX;
5450 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5451 /* Cannot merge strings after this. */
5452 scan_commit(pRExC_state, data, minlenp, is_inf);
5454 if (flags & SCF_DO_STCLASS)
5455 ssc_init_zero(pRExC_state, &accum);
5461 const regnode *nextbranch= NULL;
5464 for ( word=1 ; word <= trie->wordcount ; word++)
5466 SSize_t deltanext=0, minnext=0, f = 0, fake;
5467 regnode_ssc this_class;
5469 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
5471 data_fake.whilem_c = data->whilem_c;
5472 data_fake.last_closep = data->last_closep;
5475 data_fake.last_closep = &fake;
5476 data_fake.pos_delta = delta;
5477 if (flags & SCF_DO_STCLASS) {
5478 ssc_init(pRExC_state, &this_class);
5479 data_fake.start_class = &this_class;
5480 f = SCF_DO_STCLASS_AND;
5482 if (flags & SCF_WHILEM_VISITED_POS)
5483 f |= SCF_WHILEM_VISITED_POS;
5485 if (trie->jump[word]) {
5487 nextbranch = trie_node + trie->jump[0];
5488 scan= trie_node + trie->jump[word];
5489 /* We go from the jump point to the branch that follows
5490 it. Note this means we need the vestigal unused
5491 branches even though they arent otherwise used. */
5492 minnext = study_chunk(pRExC_state, &scan, minlenp,
5493 &deltanext, (regnode *)nextbranch, &data_fake,
5494 stopparen, recursed_depth, NULL, f,depth+1);
5496 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5497 nextbranch= regnext((regnode*)nextbranch);
5499 if (min1 > (SSize_t)(minnext + trie->minlen))
5500 min1 = minnext + trie->minlen;
5501 if (deltanext == SSize_t_MAX) {
5502 is_inf = is_inf_internal = 1;
5504 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5505 max1 = minnext + deltanext + trie->maxlen;
5507 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5509 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5510 if ( stopmin > min + min1)
5511 stopmin = min + min1;
5512 flags &= ~SCF_DO_SUBSTR;
5514 data->flags |= SCF_SEEN_ACCEPT;
5517 if (data_fake.flags & SF_HAS_EVAL)
5518 data->flags |= SF_HAS_EVAL;
5519 data->whilem_c = data_fake.whilem_c;
5521 if (flags & SCF_DO_STCLASS)
5522 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5525 if (flags & SCF_DO_SUBSTR) {
5526 data->pos_min += min1;
5527 data->pos_delta += max1 - min1;
5528 if (max1 != min1 || is_inf)
5529 data->longest = &(data->longest_float);
5532 if (delta != SSize_t_MAX)
5533 delta += max1 - min1;
5534 if (flags & SCF_DO_STCLASS_OR) {
5535 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5537 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5538 flags &= ~SCF_DO_STCLASS;
5541 else if (flags & SCF_DO_STCLASS_AND) {
5543 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5544 flags &= ~SCF_DO_STCLASS;
5547 /* Switch to OR mode: cache the old value of
5548 * data->start_class */
5550 StructCopy(data->start_class, and_withp, regnode_ssc);
5551 flags &= ~SCF_DO_STCLASS_AND;
5552 StructCopy(&accum, data->start_class, regnode_ssc);
5553 flags |= SCF_DO_STCLASS_OR;
5560 else if (PL_regkind[OP(scan)] == TRIE) {
5561 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5564 min += trie->minlen;
5565 delta += (trie->maxlen - trie->minlen);
5566 flags &= ~SCF_DO_STCLASS; /* xxx */
5567 if (flags & SCF_DO_SUBSTR) {
5568 /* Cannot expect anything... */
5569 scan_commit(pRExC_state, data, minlenp, is_inf);
5570 data->pos_min += trie->minlen;
5571 data->pos_delta += (trie->maxlen - trie->minlen);
5572 if (trie->maxlen != trie->minlen)
5573 data->longest = &(data->longest_float);
5575 if (trie->jump) /* no more substrings -- for now /grr*/
5576 flags &= ~SCF_DO_SUBSTR;
5578 #endif /* old or new */
5579 #endif /* TRIE_STUDY_OPT */
5581 /* Else: zero-length, ignore. */
5582 scan = regnext(scan);
5584 /* If we are exiting a recursion we can unset its recursed bit
5585 * and allow ourselves to enter it again - no danger of an
5586 * infinite loop there.
5587 if (stopparen > -1 && recursed) {
5588 DEBUG_STUDYDATA("unset:", data,depth);
5589 PAREN_UNSET( recursed, stopparen);
5595 DEBUG_STUDYDATA("frame-end:",data,depth);
5596 DEBUG_PEEP("fend", scan, depth);
5598 /* restore previous context */
5599 last = frame->last_regnode;
5600 scan = frame->next_regnode;
5601 stopparen = frame->stopparen;
5602 recursed_depth = frame->prev_recursed_depth;
5604 RExC_frame_last = frame->prev_frame;
5605 frame = frame->this_prev_frame;
5606 goto fake_study_recurse;
5611 DEBUG_STUDYDATA("pre-fin:",data,depth);
5614 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5616 if (flags & SCF_DO_SUBSTR && is_inf)
5617 data->pos_delta = SSize_t_MAX - data->pos_min;
5618 if (is_par > (I32)U8_MAX)
5620 if (is_par && pars==1 && data) {
5621 data->flags |= SF_IN_PAR;
5622 data->flags &= ~SF_HAS_PAR;
5624 else if (pars && data) {
5625 data->flags |= SF_HAS_PAR;
5626 data->flags &= ~SF_IN_PAR;
5628 if (flags & SCF_DO_STCLASS_OR)
5629 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5630 if (flags & SCF_TRIE_RESTUDY)
5631 data->flags |= SCF_TRIE_RESTUDY;
5633 DEBUG_STUDYDATA("post-fin:",data,depth);
5636 SSize_t final_minlen= min < stopmin ? min : stopmin;
5638 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
5639 if (final_minlen > SSize_t_MAX - delta)
5640 RExC_maxlen = SSize_t_MAX;
5641 else if (RExC_maxlen < final_minlen + delta)
5642 RExC_maxlen = final_minlen + delta;
5644 return final_minlen;
5646 NOT_REACHED; /* NOTREACHED */
5650 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5652 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5654 PERL_ARGS_ASSERT_ADD_DATA;
5656 Renewc(RExC_rxi->data,
5657 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5658 char, struct reg_data);
5660 Renew(RExC_rxi->data->what, count + n, U8);
5662 Newx(RExC_rxi->data->what, n, U8);
5663 RExC_rxi->data->count = count + n;
5664 Copy(s, RExC_rxi->data->what + count, n, U8);
5668 /*XXX: todo make this not included in a non debugging perl, but appears to be
5669 * used anyway there, in 'use re' */
5670 #ifndef PERL_IN_XSUB_RE
5672 Perl_reginitcolors(pTHX)
5674 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5676 char *t = savepv(s);
5680 t = strchr(t, '\t');
5686 PL_colors[i] = t = (char *)"";
5691 PL_colors[i++] = (char *)"";
5698 #ifdef TRIE_STUDY_OPT
5699 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5702 (data.flags & SCF_TRIE_RESTUDY) \
5710 #define CHECK_RESTUDY_GOTO_butfirst
5714 * pregcomp - compile a regular expression into internal code
5716 * Decides which engine's compiler to call based on the hint currently in
5720 #ifndef PERL_IN_XSUB_RE
5722 /* return the currently in-scope regex engine (or the default if none) */
5724 regexp_engine const *
5725 Perl_current_re_engine(pTHX)
5727 if (IN_PERL_COMPILETIME) {
5728 HV * const table = GvHV(PL_hintgv);
5731 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5732 return &reh_regexp_engine;
5733 ptr = hv_fetchs(table, "regcomp", FALSE);
5734 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5735 return &reh_regexp_engine;
5736 return INT2PTR(regexp_engine*,SvIV(*ptr));
5740 if (!PL_curcop->cop_hints_hash)
5741 return &reh_regexp_engine;
5742 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5743 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5744 return &reh_regexp_engine;
5745 return INT2PTR(regexp_engine*,SvIV(ptr));
5751 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5753 regexp_engine const *eng = current_re_engine();
5754 GET_RE_DEBUG_FLAGS_DECL;
5756 PERL_ARGS_ASSERT_PREGCOMP;
5758 /* Dispatch a request to compile a regexp to correct regexp engine. */
5760 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5763 return CALLREGCOMP_ENG(eng, pattern, flags);
5767 /* public(ish) entry point for the perl core's own regex compiling code.
5768 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5769 * pattern rather than a list of OPs, and uses the internal engine rather
5770 * than the current one */
5773 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5775 SV *pat = pattern; /* defeat constness! */
5776 PERL_ARGS_ASSERT_RE_COMPILE;
5777 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5778 #ifdef PERL_IN_XSUB_RE
5783 NULL, NULL, rx_flags, 0);
5787 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5788 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5789 * point to the realloced string and length.
5791 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5795 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5796 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5798 U8 *const src = (U8*)*pat_p;
5803 GET_RE_DEBUG_FLAGS_DECL;
5805 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5806 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5808 Newx(dst, *plen_p * 2 + 1, U8);
5811 while (s < *plen_p) {
5812 append_utf8_from_native_byte(src[s], &d);
5813 if (n < num_code_blocks) {
5814 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5815 pRExC_state->code_blocks[n].start = d - dst - 1;
5816 assert(*(d - 1) == '(');
5819 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5820 pRExC_state->code_blocks[n].end = d - dst - 1;
5821 assert(*(d - 1) == ')');
5830 *pat_p = (char*) dst;
5832 RExC_orig_utf8 = RExC_utf8 = 1;
5837 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5838 * while recording any code block indices, and handling overloading,
5839 * nested qr// objects etc. If pat is null, it will allocate a new
5840 * string, or just return the first arg, if there's only one.
5842 * Returns the malloced/updated pat.
5843 * patternp and pat_count is the array of SVs to be concatted;
5844 * oplist is the optional list of ops that generated the SVs;
5845 * recompile_p is a pointer to a boolean that will be set if
5846 * the regex will need to be recompiled.
5847 * delim, if non-null is an SV that will be inserted between each element
5851 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5852 SV *pat, SV ** const patternp, int pat_count,
5853 OP *oplist, bool *recompile_p, SV *delim)
5857 bool use_delim = FALSE;
5858 bool alloced = FALSE;
5860 /* if we know we have at least two args, create an empty string,
5861 * then concatenate args to that. For no args, return an empty string */
5862 if (!pat && pat_count != 1) {
5868 for (svp = patternp; svp < patternp + pat_count; svp++) {
5871 STRLEN orig_patlen = 0;
5873 SV *msv = use_delim ? delim : *svp;
5874 if (!msv) msv = &PL_sv_undef;
5876 /* if we've got a delimiter, we go round the loop twice for each
5877 * svp slot (except the last), using the delimiter the second
5886 if (SvTYPE(msv) == SVt_PVAV) {
5887 /* we've encountered an interpolated array within
5888 * the pattern, e.g. /...@a..../. Expand the list of elements,
5889 * then recursively append elements.
5890 * The code in this block is based on S_pushav() */
5892 AV *const av = (AV*)msv;
5893 const SSize_t maxarg = AvFILL(av) + 1;
5897 assert(oplist->op_type == OP_PADAV
5898 || oplist->op_type == OP_RV2AV);
5899 oplist = OpSIBLING(oplist);
5902 if (SvRMAGICAL(av)) {
5905 Newx(array, maxarg, SV*);
5907 for (i=0; i < maxarg; i++) {
5908 SV ** const svp = av_fetch(av, i, FALSE);
5909 array[i] = svp ? *svp : &PL_sv_undef;
5913 array = AvARRAY(av);
5915 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5916 array, maxarg, NULL, recompile_p,
5918 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5924 /* we make the assumption here that each op in the list of
5925 * op_siblings maps to one SV pushed onto the stack,
5926 * except for code blocks, with have both an OP_NULL and
5928 * This allows us to match up the list of SVs against the
5929 * list of OPs to find the next code block.
5931 * Note that PUSHMARK PADSV PADSV ..
5933 * PADRANGE PADSV PADSV ..
5934 * so the alignment still works. */
5937 if (oplist->op_type == OP_NULL
5938 && (oplist->op_flags & OPf_SPECIAL))
5940 assert(n < pRExC_state->num_code_blocks);
5941 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5942 pRExC_state->code_blocks[n].block = oplist;
5943 pRExC_state->code_blocks[n].src_regex = NULL;
5946 oplist = OpSIBLING(oplist); /* skip CONST */
5949 oplist = OpSIBLING(oplist);;
5952 /* apply magic and QR overloading to arg */
5955 if (SvROK(msv) && SvAMAGIC(msv)) {
5956 SV *sv = AMG_CALLunary(msv, regexp_amg);
5960 if (SvTYPE(sv) != SVt_REGEXP)
5961 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5966 /* try concatenation overload ... */
5967 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5968 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5971 /* overloading involved: all bets are off over literal
5972 * code. Pretend we haven't seen it */
5973 pRExC_state->num_code_blocks -= n;
5977 /* ... or failing that, try "" overload */
5978 while (SvAMAGIC(msv)
5979 && (sv = AMG_CALLunary(msv, string_amg))
5983 && SvRV(msv) == SvRV(sv))
5988 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5992 /* this is a partially unrolled
5993 * sv_catsv_nomg(pat, msv);
5994 * that allows us to adjust code block indices if
5997 char *dst = SvPV_force_nomg(pat, dlen);
5999 if (SvUTF8(msv) && !SvUTF8(pat)) {
6000 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
6001 sv_setpvn(pat, dst, dlen);
6004 sv_catsv_nomg(pat, msv);
6011 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
6014 /* extract any code blocks within any embedded qr//'s */
6015 if (rx && SvTYPE(rx) == SVt_REGEXP
6016 && RX_ENGINE((REGEXP*)rx)->op_comp)
6019 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
6020 if (ri->num_code_blocks) {
6022 /* the presence of an embedded qr// with code means
6023 * we should always recompile: the text of the
6024 * qr// may not have changed, but it may be a
6025 * different closure than last time */
6027 Renew(pRExC_state->code_blocks,
6028 pRExC_state->num_code_blocks + ri->num_code_blocks,
6029 struct reg_code_block);
6030 pRExC_state->num_code_blocks += ri->num_code_blocks;
6032 for (i=0; i < ri->num_code_blocks; i++) {
6033 struct reg_code_block *src, *dst;
6034 STRLEN offset = orig_patlen
6035 + ReANY((REGEXP *)rx)->pre_prefix;
6036 assert(n < pRExC_state->num_code_blocks);
6037 src = &ri->code_blocks[i];
6038 dst = &pRExC_state->code_blocks[n];
6039 dst->start = src->start + offset;
6040 dst->end = src->end + offset;
6041 dst->block = src->block;
6042 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
6051 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
6060 /* see if there are any run-time code blocks in the pattern.
6061 * False positives are allowed */
6064 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6065 char *pat, STRLEN plen)
6070 PERL_UNUSED_CONTEXT;
6072 for (s = 0; s < plen; s++) {
6073 if (n < pRExC_state->num_code_blocks
6074 && s == pRExC_state->code_blocks[n].start)
6076 s = pRExC_state->code_blocks[n].end;
6080 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
6082 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
6084 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
6091 /* Handle run-time code blocks. We will already have compiled any direct
6092 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
6093 * copy of it, but with any literal code blocks blanked out and
6094 * appropriate chars escaped; then feed it into
6096 * eval "qr'modified_pattern'"
6100 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
6104 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
6106 * After eval_sv()-ing that, grab any new code blocks from the returned qr
6107 * and merge them with any code blocks of the original regexp.
6109 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
6110 * instead, just save the qr and return FALSE; this tells our caller that
6111 * the original pattern needs upgrading to utf8.
6115 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
6116 char *pat, STRLEN plen)
6120 GET_RE_DEBUG_FLAGS_DECL;
6122 if (pRExC_state->runtime_code_qr) {
6123 /* this is the second time we've been called; this should
6124 * only happen if the main pattern got upgraded to utf8
6125 * during compilation; re-use the qr we compiled first time
6126 * round (which should be utf8 too)
6128 qr = pRExC_state->runtime_code_qr;
6129 pRExC_state->runtime_code_qr = NULL;
6130 assert(RExC_utf8 && SvUTF8(qr));
6136 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6140 /* determine how many extra chars we need for ' and \ escaping */
6141 for (s = 0; s < plen; s++) {
6142 if (pat[s] == '\'' || pat[s] == '\\')
6146 Newx(newpat, newlen, char);
6148 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6150 for (s = 0; s < plen; s++) {
6151 if (n < pRExC_state->num_code_blocks
6152 && s == pRExC_state->code_blocks[n].start)
6154 /* blank out literal code block */
6155 assert(pat[s] == '(');
6156 while (s <= pRExC_state->code_blocks[n].end) {
6164 if (pat[s] == '\'' || pat[s] == '\\')
6169 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6173 PerlIO_printf(Perl_debug_log,
6174 "%sre-parsing pattern for runtime code:%s %s\n",
6175 PL_colors[4],PL_colors[5],newpat);
6178 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6184 PUSHSTACKi(PERLSI_REQUIRE);
6185 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6186 * parsing qr''; normally only q'' does this. It also alters
6188 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6189 SvREFCNT_dec_NN(sv);
6194 SV * const errsv = ERRSV;
6195 if (SvTRUE_NN(errsv))
6197 Safefree(pRExC_state->code_blocks);
6198 /* use croak_sv ? */
6199 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6202 assert(SvROK(qr_ref));
6204 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6205 /* the leaving below frees the tmp qr_ref.
6206 * Give qr a life of its own */
6214 if (!RExC_utf8 && SvUTF8(qr)) {
6215 /* first time through; the pattern got upgraded; save the
6216 * qr for the next time through */
6217 assert(!pRExC_state->runtime_code_qr);
6218 pRExC_state->runtime_code_qr = qr;
6223 /* extract any code blocks within the returned qr// */
6226 /* merge the main (r1) and run-time (r2) code blocks into one */
6228 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6229 struct reg_code_block *new_block, *dst;
6230 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6233 if (!r2->num_code_blocks) /* we guessed wrong */
6235 SvREFCNT_dec_NN(qr);
6240 r1->num_code_blocks + r2->num_code_blocks,
6241 struct reg_code_block);
6244 while ( i1 < r1->num_code_blocks
6245 || i2 < r2->num_code_blocks)
6247 struct reg_code_block *src;
6250 if (i1 == r1->num_code_blocks) {
6251 src = &r2->code_blocks[i2++];
6254 else if (i2 == r2->num_code_blocks)
6255 src = &r1->code_blocks[i1++];
6256 else if ( r1->code_blocks[i1].start
6257 < r2->code_blocks[i2].start)
6259 src = &r1->code_blocks[i1++];
6260 assert(src->end < r2->code_blocks[i2].start);
6263 assert( r1->code_blocks[i1].start
6264 > r2->code_blocks[i2].start);
6265 src = &r2->code_blocks[i2++];
6267 assert(src->end < r1->code_blocks[i1].start);
6270 assert(pat[src->start] == '(');
6271 assert(pat[src->end] == ')');
6272 dst->start = src->start;
6273 dst->end = src->end;
6274 dst->block = src->block;
6275 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6279 r1->num_code_blocks += r2->num_code_blocks;
6280 Safefree(r1->code_blocks);
6281 r1->code_blocks = new_block;
6284 SvREFCNT_dec_NN(qr);
6290 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6291 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6292 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6293 STRLEN longest_length, bool eol, bool meol)
6295 /* This is the common code for setting up the floating and fixed length
6296 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6297 * as to whether succeeded or not */
6302 if (! (longest_length
6303 || (eol /* Can't have SEOL and MULTI */
6304 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6306 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6307 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6312 /* copy the information about the longest from the reg_scan_data
6313 over to the program. */
6314 if (SvUTF8(sv_longest)) {
6315 *rx_utf8 = sv_longest;
6318 *rx_substr = sv_longest;
6321 /* end_shift is how many chars that must be matched that
6322 follow this item. We calculate it ahead of time as once the
6323 lookbehind offset is added in we lose the ability to correctly
6325 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6326 *rx_end_shift = ml - offset
6327 - longest_length + (SvTAIL(sv_longest) != 0)
6330 t = (eol/* Can't have SEOL and MULTI */
6331 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6332 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6338 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6339 * regular expression into internal code.
6340 * The pattern may be passed either as:
6341 * a list of SVs (patternp plus pat_count)
6342 * a list of OPs (expr)
6343 * If both are passed, the SV list is used, but the OP list indicates
6344 * which SVs are actually pre-compiled code blocks
6346 * The SVs in the list have magic and qr overloading applied to them (and
6347 * the list may be modified in-place with replacement SVs in the latter
6350 * If the pattern hasn't changed from old_re, then old_re will be
6353 * eng is the current engine. If that engine has an op_comp method, then
6354 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6355 * do the initial concatenation of arguments and pass on to the external
6358 * If is_bare_re is not null, set it to a boolean indicating whether the
6359 * arg list reduced (after overloading) to a single bare regex which has
6360 * been returned (i.e. /$qr/).
6362 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6364 * pm_flags contains the PMf_* flags, typically based on those from the
6365 * pm_flags field of the related PMOP. Currently we're only interested in
6366 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6368 * We can't allocate space until we know how big the compiled form will be,
6369 * but we can't compile it (and thus know how big it is) until we've got a
6370 * place to put the code. So we cheat: we compile it twice, once with code
6371 * generation turned off and size counting turned on, and once "for real".
6372 * This also means that we don't allocate space until we are sure that the
6373 * thing really will compile successfully, and we never have to move the
6374 * code and thus invalidate pointers into it. (Note that it has to be in
6375 * one piece because free() must be able to free it all.) [NB: not true in perl]
6377 * Beware that the optimization-preparation code in here knows about some
6378 * of the structure of the compiled regexp. [I'll say.]
6382 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6383 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6384 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6388 regexp_internal *ri;
6396 SV *code_blocksv = NULL;
6397 SV** new_patternp = patternp;
6399 /* these are all flags - maybe they should be turned
6400 * into a single int with different bit masks */
6401 I32 sawlookahead = 0;
6406 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6408 bool runtime_code = 0;
6410 RExC_state_t RExC_state;
6411 RExC_state_t * const pRExC_state = &RExC_state;
6412 #ifdef TRIE_STUDY_OPT
6414 RExC_state_t copyRExC_state;
6416 GET_RE_DEBUG_FLAGS_DECL;
6418 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6420 DEBUG_r(if (!PL_colorset) reginitcolors());
6422 /* Initialize these here instead of as-needed, as is quick and avoids
6423 * having to test them each time otherwise */
6424 if (! PL_AboveLatin1) {
6425 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6426 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6427 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6428 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6429 PL_HasMultiCharFold =
6430 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6432 /* This is calculated here, because the Perl program that generates the
6433 * static global ones doesn't currently have access to
6434 * NUM_ANYOF_CODE_POINTS */
6435 PL_InBitmap = _new_invlist(2);
6436 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6437 NUM_ANYOF_CODE_POINTS - 1);
6440 pRExC_state->code_blocks = NULL;
6441 pRExC_state->num_code_blocks = 0;
6444 *is_bare_re = FALSE;
6446 if (expr && (expr->op_type == OP_LIST ||
6447 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6448 /* allocate code_blocks if needed */
6452 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
6453 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6454 ncode++; /* count of DO blocks */
6456 pRExC_state->num_code_blocks = ncode;
6457 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6462 /* compile-time pattern with just OP_CONSTs and DO blocks */
6467 /* find how many CONSTs there are */
6470 if (expr->op_type == OP_CONST)
6473 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6474 if (o->op_type == OP_CONST)
6478 /* fake up an SV array */
6480 assert(!new_patternp);
6481 Newx(new_patternp, n, SV*);
6482 SAVEFREEPV(new_patternp);
6486 if (expr->op_type == OP_CONST)
6487 new_patternp[n] = cSVOPx_sv(expr);
6489 for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
6490 if (o->op_type == OP_CONST)
6491 new_patternp[n++] = cSVOPo_sv;
6496 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6497 "Assembling pattern from %d elements%s\n", pat_count,
6498 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6500 /* set expr to the first arg op */
6502 if (pRExC_state->num_code_blocks
6503 && expr->op_type != OP_CONST)
6505 expr = cLISTOPx(expr)->op_first;
6506 assert( expr->op_type == OP_PUSHMARK
6507 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6508 || expr->op_type == OP_PADRANGE);
6509 expr = OpSIBLING(expr);
6512 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6513 expr, &recompile, NULL);
6515 /* handle bare (possibly after overloading) regex: foo =~ $re */
6520 if (SvTYPE(re) == SVt_REGEXP) {
6524 Safefree(pRExC_state->code_blocks);
6525 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6526 "Precompiled pattern%s\n",
6527 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6533 exp = SvPV_nomg(pat, plen);
6535 if (!eng->op_comp) {
6536 if ((SvUTF8(pat) && IN_BYTES)
6537 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6539 /* make a temporary copy; either to convert to bytes,
6540 * or to avoid repeating get-magic / overloaded stringify */
6541 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6542 (IN_BYTES ? 0 : SvUTF8(pat)));
6544 Safefree(pRExC_state->code_blocks);
6545 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6548 /* ignore the utf8ness if the pattern is 0 length */
6549 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6550 RExC_uni_semantics = 0;
6551 RExC_contains_locale = 0;
6552 RExC_contains_i = 0;
6553 RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
6554 pRExC_state->runtime_code_qr = NULL;
6555 RExC_frame_head= NULL;
6556 RExC_frame_last= NULL;
6557 RExC_frame_count= 0;
6560 RExC_mysv1= sv_newmortal();
6561 RExC_mysv2= sv_newmortal();
6564 SV *dsv= sv_newmortal();
6565 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6566 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6567 PL_colors[4],PL_colors[5],s);
6571 /* we jump here if we upgrade the pattern to utf8 and have to
6574 if ((pm_flags & PMf_USE_RE_EVAL)
6575 /* this second condition covers the non-regex literal case,
6576 * i.e. $foo =~ '(?{})'. */
6577 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6579 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6581 /* return old regex if pattern hasn't changed */
6582 /* XXX: note in the below we have to check the flags as well as the
6585 * Things get a touch tricky as we have to compare the utf8 flag
6586 * independently from the compile flags. */
6590 && !!RX_UTF8(old_re) == !!RExC_utf8
6591 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6592 && RX_PRECOMP(old_re)
6593 && RX_PRELEN(old_re) == plen
6594 && memEQ(RX_PRECOMP(old_re), exp, plen)
6595 && !runtime_code /* with runtime code, always recompile */ )
6597 Safefree(pRExC_state->code_blocks);
6601 rx_flags = orig_rx_flags;
6603 if (rx_flags & PMf_FOLD) {
6604 RExC_contains_i = 1;
6606 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6608 /* Set to use unicode semantics if the pattern is in utf8 and has the
6609 * 'depends' charset specified, as it means unicode when utf8 */
6610 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6614 RExC_flags = rx_flags;
6615 RExC_pm_flags = pm_flags;
6618 if (TAINTING_get && TAINT_get)
6619 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6621 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6622 /* whoops, we have a non-utf8 pattern, whilst run-time code
6623 * got compiled as utf8. Try again with a utf8 pattern */
6624 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6625 pRExC_state->num_code_blocks);
6626 goto redo_first_pass;
6629 assert(!pRExC_state->runtime_code_qr);
6635 RExC_in_lookbehind = 0;
6636 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6638 RExC_override_recoding = 0;
6640 RExC_recode_x_to_native = 0;
6642 RExC_in_multi_char_class = 0;
6644 /* First pass: determine size, legality. */
6647 RExC_end = exp + plen;
6652 RExC_emit = (regnode *) &RExC_emit_dummy;
6653 RExC_whilem_seen = 0;
6654 RExC_open_parens = NULL;
6655 RExC_close_parens = NULL;
6657 RExC_paren_names = NULL;
6659 RExC_paren_name_list = NULL;
6661 RExC_recurse = NULL;
6662 RExC_study_chunk_recursed = NULL;
6663 RExC_study_chunk_recursed_bytes= 0;
6664 RExC_recurse_count = 0;
6665 pRExC_state->code_index = 0;
6668 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6670 RExC_lastparse=NULL;
6672 /* reg may croak on us, not giving us a chance to free
6673 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6674 need it to survive as long as the regexp (qr/(?{})/).
6675 We must check that code_blocksv is not already set, because we may
6676 have jumped back to restart the sizing pass. */
6677 if (pRExC_state->code_blocks && !code_blocksv) {
6678 code_blocksv = newSV_type(SVt_PV);
6679 SAVEFREESV(code_blocksv);
6680 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6681 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6683 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6684 /* It's possible to write a regexp in ascii that represents Unicode
6685 codepoints outside of the byte range, such as via \x{100}. If we
6686 detect such a sequence we have to convert the entire pattern to utf8
6687 and then recompile, as our sizing calculation will have been based
6688 on 1 byte == 1 character, but we will need to use utf8 to encode
6689 at least some part of the pattern, and therefore must convert the whole
6692 if (flags & RESTART_UTF8) {
6693 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6694 pRExC_state->num_code_blocks);
6695 goto redo_first_pass;
6697 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6700 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6703 PerlIO_printf(Perl_debug_log,
6704 "Required size %"IVdf" nodes\n"
6705 "Starting second pass (creation)\n",
6708 RExC_lastparse=NULL;
6711 /* The first pass could have found things that force Unicode semantics */
6712 if ((RExC_utf8 || RExC_uni_semantics)
6713 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6715 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6718 /* Small enough for pointer-storage convention?
6719 If extralen==0, this means that we will not need long jumps. */
6720 if (RExC_size >= 0x10000L && RExC_extralen)
6721 RExC_size += RExC_extralen;
6724 if (RExC_whilem_seen > 15)
6725 RExC_whilem_seen = 15;
6727 /* Allocate space and zero-initialize. Note, the two step process
6728 of zeroing when in debug mode, thus anything assigned has to
6729 happen after that */
6730 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6732 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6733 char, regexp_internal);
6734 if ( r == NULL || ri == NULL )
6735 FAIL("Regexp out of space");
6737 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6738 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6741 /* bulk initialize base fields with 0. */
6742 Zero(ri, sizeof(regexp_internal), char);
6745 /* non-zero initialization begins here */
6748 r->extflags = rx_flags;
6749 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6751 if (pm_flags & PMf_IS_QR) {
6752 ri->code_blocks = pRExC_state->code_blocks;
6753 ri->num_code_blocks = pRExC_state->num_code_blocks;
6758 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6759 if (pRExC_state->code_blocks[n].src_regex)
6760 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6761 SAVEFREEPV(pRExC_state->code_blocks);
6765 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6766 bool has_charset = (get_regex_charset(r->extflags)
6767 != REGEX_DEPENDS_CHARSET);
6769 /* The caret is output if there are any defaults: if not all the STD
6770 * flags are set, or if no character set specifier is needed */
6772 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6774 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6775 == REG_RUN_ON_COMMENT_SEEN);
6776 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6777 >> RXf_PMf_STD_PMMOD_SHIFT);
6778 const char *fptr = STD_PAT_MODS; /*"msixn"*/
6780 /* Allocate for the worst case, which is all the std flags are turned
6781 * on. If more precision is desired, we could do a population count of
6782 * the flags set. This could be done with a small lookup table, or by
6783 * shifting, masking and adding, or even, when available, assembly
6784 * language for a machine-language population count.
6785 * We never output a minus, as all those are defaults, so are
6786 * covered by the caret */
6787 const STRLEN wraplen = plen + has_p + has_runon
6788 + has_default /* If needs a caret */
6790 /* If needs a character set specifier */
6791 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6792 + (sizeof(STD_PAT_MODS) - 1)
6793 + (sizeof("(?:)") - 1);
6795 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6796 r->xpv_len_u.xpvlenu_pv = p;
6798 SvFLAGS(rx) |= SVf_UTF8;
6801 /* If a default, cover it using the caret */
6803 *p++= DEFAULT_PAT_MOD;
6807 const char* const name = get_regex_charset_name(r->extflags, &len);
6808 Copy(name, p, len, char);
6812 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6815 while((ch = *fptr++)) {
6823 Copy(RExC_precomp, p, plen, char);
6824 assert ((RX_WRAPPED(rx) - p) < 16);
6825 r->pre_prefix = p - RX_WRAPPED(rx);
6831 SvCUR_set(rx, p - RX_WRAPPED(rx));
6835 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6837 /* setup various meta data about recursion, this all requires
6838 * RExC_npar to be correctly set, and a bit later on we clear it */
6839 if (RExC_seen & REG_RECURSE_SEEN) {
6840 Newxz(RExC_open_parens, RExC_npar,regnode *);
6841 SAVEFREEPV(RExC_open_parens);
6842 Newxz(RExC_close_parens,RExC_npar,regnode *);
6843 SAVEFREEPV(RExC_close_parens);
6845 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6846 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6847 * So its 1 if there are no parens. */
6848 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6849 ((RExC_npar & 0x07) != 0);
6850 Newx(RExC_study_chunk_recursed,
6851 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6852 SAVEFREEPV(RExC_study_chunk_recursed);
6855 /* Useful during FAIL. */
6856 #ifdef RE_TRACK_PATTERN_OFFSETS
6857 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6858 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6859 "%s %"UVuf" bytes for offset annotations.\n",
6860 ri->u.offsets ? "Got" : "Couldn't get",
6861 (UV)((2*RExC_size+1) * sizeof(U32))));
6863 SetProgLen(ri,RExC_size);
6867 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6869 /* Second pass: emit code. */
6870 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6871 RExC_pm_flags = pm_flags;
6873 RExC_end = exp + plen;
6876 RExC_emit_start = ri->program;
6877 RExC_emit = ri->program;
6878 RExC_emit_bound = ri->program + RExC_size + 1;
6879 pRExC_state->code_index = 0;
6881 *((char*) RExC_emit++) = (char) REG_MAGIC;
6882 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6884 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6886 /* XXXX To minimize changes to RE engine we always allocate
6887 3-units-long substrs field. */
6888 Newx(r->substrs, 1, struct reg_substr_data);
6889 if (RExC_recurse_count) {
6890 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6891 SAVEFREEPV(RExC_recurse);
6895 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6897 RExC_study_chunk_recursed_count= 0;
6899 Zero(r->substrs, 1, struct reg_substr_data);
6900 if (RExC_study_chunk_recursed) {
6901 Zero(RExC_study_chunk_recursed,
6902 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6906 #ifdef TRIE_STUDY_OPT
6908 StructCopy(&zero_scan_data, &data, scan_data_t);
6909 copyRExC_state = RExC_state;
6912 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6914 RExC_state = copyRExC_state;
6915 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6916 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6918 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6919 StructCopy(&zero_scan_data, &data, scan_data_t);
6922 StructCopy(&zero_scan_data, &data, scan_data_t);
6925 /* Dig out information for optimizations. */
6926 r->extflags = RExC_flags; /* was pm_op */
6927 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6930 SvUTF8_on(rx); /* Unicode in it? */
6931 ri->regstclass = NULL;
6932 if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
6933 r->intflags |= PREGf_NAUGHTY;
6934 scan = ri->program + 1; /* First BRANCH. */
6936 /* testing for BRANCH here tells us whether there is "must appear"
6937 data in the pattern. If there is then we can use it for optimisations */
6938 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
6941 STRLEN longest_float_length, longest_fixed_length;
6942 regnode_ssc ch_class; /* pointed to by data */
6944 SSize_t last_close = 0; /* pointed to by data */
6945 regnode *first= scan;
6946 regnode *first_next= regnext(first);
6948 * Skip introductions and multiplicators >= 1
6949 * so that we can extract the 'meat' of the pattern that must
6950 * match in the large if() sequence following.
6951 * NOTE that EXACT is NOT covered here, as it is normally
6952 * picked up by the optimiser separately.
6954 * This is unfortunate as the optimiser isnt handling lookahead
6955 * properly currently.
6958 while ((OP(first) == OPEN && (sawopen = 1)) ||
6959 /* An OR of *one* alternative - should not happen now. */
6960 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6961 /* for now we can't handle lookbehind IFMATCH*/
6962 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6963 (OP(first) == PLUS) ||
6964 (OP(first) == MINMOD) ||
6965 /* An {n,m} with n>0 */
6966 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6967 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6970 * the only op that could be a regnode is PLUS, all the rest
6971 * will be regnode_1 or regnode_2.
6973 * (yves doesn't think this is true)
6975 if (OP(first) == PLUS)
6978 if (OP(first) == MINMOD)
6980 first += regarglen[OP(first)];
6982 first = NEXTOPER(first);
6983 first_next= regnext(first);
6986 /* Starting-point info. */
6988 DEBUG_PEEP("first:",first,0);
6989 /* Ignore EXACT as we deal with it later. */
6990 if (PL_regkind[OP(first)] == EXACT) {
6991 if (OP(first) == EXACT || OP(first) == EXACTL)
6992 NOOP; /* Empty, get anchored substr later. */
6994 ri->regstclass = first;
6997 else if (PL_regkind[OP(first)] == TRIE &&
6998 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
7000 /* this can happen only on restudy */
7001 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
7004 else if (REGNODE_SIMPLE(OP(first)))
7005 ri->regstclass = first;
7006 else if (PL_regkind[OP(first)] == BOUND ||
7007 PL_regkind[OP(first)] == NBOUND)
7008 ri->regstclass = first;
7009 else if (PL_regkind[OP(first)] == BOL) {
7010 r->intflags |= (OP(first) == MBOL
7013 first = NEXTOPER(first);
7016 else if (OP(first) == GPOS) {
7017 r->intflags |= PREGf_ANCH_GPOS;
7018 first = NEXTOPER(first);
7021 else if ((!sawopen || !RExC_sawback) &&
7023 (OP(first) == STAR &&
7024 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
7025 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
7027 /* turn .* into ^.* with an implied $*=1 */
7029 (OP(NEXTOPER(first)) == REG_ANY)
7032 r->intflags |= (type | PREGf_IMPLICIT);
7033 first = NEXTOPER(first);
7036 if (sawplus && !sawminmod && !sawlookahead
7037 && (!sawopen || !RExC_sawback)
7038 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
7039 /* x+ must match at the 1st pos of run of x's */
7040 r->intflags |= PREGf_SKIP;
7042 /* Scan is after the zeroth branch, first is atomic matcher. */
7043 #ifdef TRIE_STUDY_OPT
7046 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7047 (IV)(first - scan + 1))
7051 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
7052 (IV)(first - scan + 1))
7058 * If there's something expensive in the r.e., find the
7059 * longest literal string that must appear and make it the
7060 * regmust. Resolve ties in favor of later strings, since
7061 * the regstart check works with the beginning of the r.e.
7062 * and avoiding duplication strengthens checking. Not a
7063 * strong reason, but sufficient in the absence of others.
7064 * [Now we resolve ties in favor of the earlier string if
7065 * it happens that c_offset_min has been invalidated, since the
7066 * earlier string may buy us something the later one won't.]
7069 data.longest_fixed = newSVpvs("");
7070 data.longest_float = newSVpvs("");
7071 data.last_found = newSVpvs("");
7072 data.longest = &(data.longest_fixed);
7073 ENTER_with_name("study_chunk");
7074 SAVEFREESV(data.longest_fixed);
7075 SAVEFREESV(data.longest_float);
7076 SAVEFREESV(data.last_found);
7078 if (!ri->regstclass) {
7079 ssc_init(pRExC_state, &ch_class);
7080 data.start_class = &ch_class;
7081 stclass_flag = SCF_DO_STCLASS_AND;
7082 } else /* XXXX Check for BOUND? */
7084 data.last_closep = &last_close;
7087 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
7088 scan + RExC_size, /* Up to end */
7090 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
7091 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
7095 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
7098 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
7099 && data.last_start_min == 0 && data.last_end > 0
7100 && !RExC_seen_zerolen
7101 && !(RExC_seen & REG_VERBARG_SEEN)
7102 && !(RExC_seen & REG_GPOS_SEEN)
7104 r->extflags |= RXf_CHECK_ALL;
7106 scan_commit(pRExC_state, &data,&minlen,0);
7108 longest_float_length = CHR_SVLEN(data.longest_float);
7110 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
7111 && data.offset_fixed == data.offset_float_min
7112 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
7113 && S_setup_longest (aTHX_ pRExC_state,
7117 &(r->float_end_shift),
7118 data.lookbehind_float,
7119 data.offset_float_min,
7121 longest_float_length,
7122 cBOOL(data.flags & SF_FL_BEFORE_EOL),
7123 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
7125 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
7126 r->float_max_offset = data.offset_float_max;
7127 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
7128 r->float_max_offset -= data.lookbehind_float;
7129 SvREFCNT_inc_simple_void_NN(data.longest_float);
7132 r->float_substr = r->float_utf8 = NULL;
7133 longest_float_length = 0;
7136 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7138 if (S_setup_longest (aTHX_ pRExC_state,
7140 &(r->anchored_utf8),
7141 &(r->anchored_substr),
7142 &(r->anchored_end_shift),
7143 data.lookbehind_fixed,
7146 longest_fixed_length,
7147 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7148 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7150 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7151 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7154 r->anchored_substr = r->anchored_utf8 = NULL;
7155 longest_fixed_length = 0;
7157 LEAVE_with_name("study_chunk");
7160 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7161 ri->regstclass = NULL;
7163 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7165 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7166 && is_ssc_worth_it(pRExC_state, data.start_class))
7168 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7170 ssc_finalize(pRExC_state, data.start_class);
7172 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7173 StructCopy(data.start_class,
7174 (regnode_ssc*)RExC_rxi->data->data[n],
7176 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7177 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7178 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7179 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7180 PerlIO_printf(Perl_debug_log,
7181 "synthetic stclass \"%s\".\n",
7182 SvPVX_const(sv));});
7183 data.start_class = NULL;
7186 /* A temporary algorithm prefers floated substr to fixed one to dig
7188 if (longest_fixed_length > longest_float_length) {
7189 r->substrs->check_ix = 0;
7190 r->check_end_shift = r->anchored_end_shift;
7191 r->check_substr = r->anchored_substr;
7192 r->check_utf8 = r->anchored_utf8;
7193 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7194 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7195 r->intflags |= PREGf_NOSCAN;
7198 r->substrs->check_ix = 1;
7199 r->check_end_shift = r->float_end_shift;
7200 r->check_substr = r->float_substr;
7201 r->check_utf8 = r->float_utf8;
7202 r->check_offset_min = r->float_min_offset;
7203 r->check_offset_max = r->float_max_offset;
7205 if ((r->check_substr || r->check_utf8) ) {
7206 r->extflags |= RXf_USE_INTUIT;
7207 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7208 r->extflags |= RXf_INTUIT_TAIL;
7210 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7212 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7213 if ( (STRLEN)minlen < longest_float_length )
7214 minlen= longest_float_length;
7215 if ( (STRLEN)minlen < longest_fixed_length )
7216 minlen= longest_fixed_length;
7220 /* Several toplevels. Best we can is to set minlen. */
7222 regnode_ssc ch_class;
7223 SSize_t last_close = 0;
7225 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7227 scan = ri->program + 1;
7228 ssc_init(pRExC_state, &ch_class);
7229 data.start_class = &ch_class;
7230 data.last_closep = &last_close;
7233 minlen = study_chunk(pRExC_state,
7234 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7235 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7236 ? SCF_TRIE_DOING_RESTUDY
7240 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7242 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7243 = r->float_substr = r->float_utf8 = NULL;
7245 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7246 && is_ssc_worth_it(pRExC_state, data.start_class))
7248 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7250 ssc_finalize(pRExC_state, data.start_class);
7252 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7253 StructCopy(data.start_class,
7254 (regnode_ssc*)RExC_rxi->data->data[n],
7256 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7257 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7258 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7259 regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
7260 PerlIO_printf(Perl_debug_log,
7261 "synthetic stclass \"%s\".\n",
7262 SvPVX_const(sv));});
7263 data.start_class = NULL;
7267 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7268 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7269 r->maxlen = REG_INFTY;
7272 r->maxlen = RExC_maxlen;
7275 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7276 the "real" pattern. */
7278 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7279 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7281 r->minlenret = minlen;
7282 if (r->minlen < minlen)
7285 if (RExC_seen & REG_GPOS_SEEN)
7286 r->intflags |= PREGf_GPOS_SEEN;
7287 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7288 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7290 if (pRExC_state->num_code_blocks)
7291 r->extflags |= RXf_EVAL_SEEN;
7292 if (RExC_seen & REG_VERBARG_SEEN)
7294 r->intflags |= PREGf_VERBARG_SEEN;
7295 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7297 if (RExC_seen & REG_CUTGROUP_SEEN)
7298 r->intflags |= PREGf_CUTGROUP_SEEN;
7299 if (pm_flags & PMf_USE_RE_EVAL)
7300 r->intflags |= PREGf_USE_RE_EVAL;
7301 if (RExC_paren_names)
7302 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7304 RXp_PAREN_NAMES(r) = NULL;
7306 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7307 * so it can be used in pp.c */
7308 if (r->intflags & PREGf_ANCH)
7309 r->extflags |= RXf_IS_ANCHORED;
7313 /* this is used to identify "special" patterns that might result
7314 * in Perl NOT calling the regex engine and instead doing the match "itself",
7315 * particularly special cases in split//. By having the regex compiler
7316 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7317 * we avoid weird issues with equivalent patterns resulting in different behavior,
7318 * AND we allow non Perl engines to get the same optimizations by the setting the
7319 * flags appropriately - Yves */
7320 regnode *first = ri->program + 1;
7322 regnode *next = regnext(first);
7325 if (PL_regkind[fop] == NOTHING && nop == END)
7326 r->extflags |= RXf_NULL;
7327 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7328 /* when fop is SBOL first->flags will be true only when it was
7329 * produced by parsing /\A/, and not when parsing /^/. This is
7330 * very important for the split code as there we want to
7331 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7332 * See rt #122761 for more details. -- Yves */
7333 r->extflags |= RXf_START_ONLY;
7334 else if (fop == PLUS
7335 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7337 r->extflags |= RXf_WHITE;
7338 else if ( r->extflags & RXf_SPLIT
7339 && (fop == EXACT || fop == EXACTL)
7340 && STR_LEN(first) == 1
7341 && *(STRING(first)) == ' '
7343 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7347 if (RExC_contains_locale) {
7348 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7352 if (RExC_paren_names) {
7353 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7354 ri->data->data[ri->name_list_idx]
7355 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7358 ri->name_list_idx = 0;
7360 if (RExC_recurse_count) {
7361 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7362 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7363 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7366 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7367 /* assume we don't need to swap parens around before we match */
7369 PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
7370 (unsigned long)RExC_study_chunk_recursed_count);
7374 PerlIO_printf(Perl_debug_log,"Final program:\n");
7377 #ifdef RE_TRACK_PATTERN_OFFSETS
7378 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7379 const STRLEN len = ri->u.offsets[0];
7381 GET_RE_DEBUG_FLAGS_DECL;
7382 PerlIO_printf(Perl_debug_log,
7383 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7384 for (i = 1; i <= len; i++) {
7385 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7386 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7387 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7389 PerlIO_printf(Perl_debug_log, "\n");
7394 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7395 * by setting the regexp SV to readonly-only instead. If the
7396 * pattern's been recompiled, the USEDness should remain. */
7397 if (old_re && SvREADONLY(old_re))
7405 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7408 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7410 PERL_UNUSED_ARG(value);
7412 if (flags & RXapif_FETCH) {
7413 return reg_named_buff_fetch(rx, key, flags);
7414 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7415 Perl_croak_no_modify();
7417 } else if (flags & RXapif_EXISTS) {
7418 return reg_named_buff_exists(rx, key, flags)
7421 } else if (flags & RXapif_REGNAMES) {
7422 return reg_named_buff_all(rx, flags);
7423 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7424 return reg_named_buff_scalar(rx, flags);
7426 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7432 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7435 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7436 PERL_UNUSED_ARG(lastkey);
7438 if (flags & RXapif_FIRSTKEY)
7439 return reg_named_buff_firstkey(rx, flags);
7440 else if (flags & RXapif_NEXTKEY)
7441 return reg_named_buff_nextkey(rx, flags);
7443 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7450 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7453 AV *retarray = NULL;
7455 struct regexp *const rx = ReANY(r);
7457 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7459 if (flags & RXapif_ALL)
7462 if (rx && RXp_PAREN_NAMES(rx)) {
7463 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7466 SV* sv_dat=HeVAL(he_str);
7467 I32 *nums=(I32*)SvPVX(sv_dat);
7468 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7469 if ((I32)(rx->nparens) >= nums[i]
7470 && rx->offs[nums[i]].start != -1
7471 && rx->offs[nums[i]].end != -1)
7474 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7479 ret = newSVsv(&PL_sv_undef);
7482 av_push(retarray, ret);
7485 return newRV_noinc(MUTABLE_SV(retarray));
7492 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7495 struct regexp *const rx = ReANY(r);
7497 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7499 if (rx && RXp_PAREN_NAMES(rx)) {
7500 if (flags & RXapif_ALL) {
7501 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7503 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7505 SvREFCNT_dec_NN(sv);
7517 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7519 struct regexp *const rx = ReANY(r);
7521 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7523 if ( rx && RXp_PAREN_NAMES(rx) ) {
7524 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7526 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7533 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7535 struct regexp *const rx = ReANY(r);
7536 GET_RE_DEBUG_FLAGS_DECL;
7538 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7540 if (rx && RXp_PAREN_NAMES(rx)) {
7541 HV *hv = RXp_PAREN_NAMES(rx);
7543 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7546 SV* sv_dat = HeVAL(temphe);
7547 I32 *nums = (I32*)SvPVX(sv_dat);
7548 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7549 if ((I32)(rx->lastparen) >= nums[i] &&
7550 rx->offs[nums[i]].start != -1 &&
7551 rx->offs[nums[i]].end != -1)
7557 if (parno || flags & RXapif_ALL) {
7558 return newSVhek(HeKEY_hek(temphe));
7566 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7571 struct regexp *const rx = ReANY(r);
7573 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7575 if (rx && RXp_PAREN_NAMES(rx)) {
7576 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7577 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7578 } else if (flags & RXapif_ONE) {
7579 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7580 av = MUTABLE_AV(SvRV(ret));
7581 length = av_tindex(av);
7582 SvREFCNT_dec_NN(ret);
7583 return newSViv(length + 1);
7585 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7590 return &PL_sv_undef;
7594 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7596 struct regexp *const rx = ReANY(r);
7599 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7601 if (rx && RXp_PAREN_NAMES(rx)) {
7602 HV *hv= RXp_PAREN_NAMES(rx);
7604 (void)hv_iterinit(hv);
7605 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7608 SV* sv_dat = HeVAL(temphe);
7609 I32 *nums = (I32*)SvPVX(sv_dat);
7610 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7611 if ((I32)(rx->lastparen) >= nums[i] &&
7612 rx->offs[nums[i]].start != -1 &&
7613 rx->offs[nums[i]].end != -1)
7619 if (parno || flags & RXapif_ALL) {
7620 av_push(av, newSVhek(HeKEY_hek(temphe)));
7625 return newRV_noinc(MUTABLE_SV(av));
7629 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7632 struct regexp *const rx = ReANY(r);
7638 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7640 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7641 || n == RX_BUFF_IDX_CARET_FULLMATCH
7642 || n == RX_BUFF_IDX_CARET_POSTMATCH
7645 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7647 /* on something like
7650 * the KEEPCOPY is set on the PMOP rather than the regex */
7651 if (PL_curpm && r == PM_GETRE(PL_curpm))
7652 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7661 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7662 /* no need to distinguish between them any more */
7663 n = RX_BUFF_IDX_FULLMATCH;
7665 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7666 && rx->offs[0].start != -1)
7668 /* $`, ${^PREMATCH} */
7669 i = rx->offs[0].start;
7673 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7674 && rx->offs[0].end != -1)
7676 /* $', ${^POSTMATCH} */
7677 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7678 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7681 if ( 0 <= n && n <= (I32)rx->nparens &&
7682 (s1 = rx->offs[n].start) != -1 &&
7683 (t1 = rx->offs[n].end) != -1)
7685 /* $&, ${^MATCH}, $1 ... */
7687 s = rx->subbeg + s1 - rx->suboffset;
7692 assert(s >= rx->subbeg);
7693 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7695 #ifdef NO_TAINT_SUPPORT
7696 sv_setpvn(sv, s, i);
7698 const int oldtainted = TAINT_get;
7700 sv_setpvn(sv, s, i);
7701 TAINT_set(oldtainted);
7703 if (RXp_MATCH_UTF8(rx))
7708 if (RXp_MATCH_TAINTED(rx)) {
7709 if (SvTYPE(sv) >= SVt_PVMG) {
7710 MAGIC* const mg = SvMAGIC(sv);
7713 SvMAGIC_set(sv, mg->mg_moremagic);
7715 if ((mgt = SvMAGIC(sv))) {
7716 mg->mg_moremagic = mgt;
7717 SvMAGIC_set(sv, mg);
7728 sv_setsv(sv,&PL_sv_undef);
7734 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7735 SV const * const value)
7737 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7739 PERL_UNUSED_ARG(rx);
7740 PERL_UNUSED_ARG(paren);
7741 PERL_UNUSED_ARG(value);
7744 Perl_croak_no_modify();
7748 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7751 struct regexp *const rx = ReANY(r);
7755 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7757 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7758 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7759 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7762 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7764 /* on something like
7767 * the KEEPCOPY is set on the PMOP rather than the regex */
7768 if (PL_curpm && r == PM_GETRE(PL_curpm))
7769 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7775 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7777 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7778 case RX_BUFF_IDX_PREMATCH: /* $` */
7779 if (rx->offs[0].start != -1) {
7780 i = rx->offs[0].start;
7789 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7790 case RX_BUFF_IDX_POSTMATCH: /* $' */
7791 if (rx->offs[0].end != -1) {
7792 i = rx->sublen - rx->offs[0].end;
7794 s1 = rx->offs[0].end;
7801 default: /* $& / ${^MATCH}, $1, $2, ... */
7802 if (paren <= (I32)rx->nparens &&
7803 (s1 = rx->offs[paren].start) != -1 &&
7804 (t1 = rx->offs[paren].end) != -1)
7810 if (ckWARN(WARN_UNINITIALIZED))
7811 report_uninit((const SV *)sv);
7816 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7817 const char * const s = rx->subbeg - rx->suboffset + s1;
7822 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7829 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7831 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7832 PERL_UNUSED_ARG(rx);
7836 return newSVpvs("Regexp");
7839 /* Scans the name of a named buffer from the pattern.
7840 * If flags is REG_RSN_RETURN_NULL returns null.
7841 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7842 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7843 * to the parsed name as looked up in the RExC_paren_names hash.
7844 * If there is an error throws a vFAIL().. type exception.
7847 #define REG_RSN_RETURN_NULL 0
7848 #define REG_RSN_RETURN_NAME 1
7849 #define REG_RSN_RETURN_DATA 2
7852 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7854 char *name_start = RExC_parse;
7856 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7858 assert (RExC_parse <= RExC_end);
7859 if (RExC_parse == RExC_end) NOOP;
7860 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7861 /* skip IDFIRST by using do...while */
7864 RExC_parse += UTF8SKIP(RExC_parse);
7865 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7869 } while (isWORDCHAR(*RExC_parse));
7871 RExC_parse++; /* so the <- from the vFAIL is after the offending
7873 vFAIL("Group name must start with a non-digit word character");
7877 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7878 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7879 if ( flags == REG_RSN_RETURN_NAME)
7881 else if (flags==REG_RSN_RETURN_DATA) {
7884 if ( ! sv_name ) /* should not happen*/
7885 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7886 if (RExC_paren_names)
7887 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7889 sv_dat = HeVAL(he_str);
7891 vFAIL("Reference to nonexistent named group");
7895 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7896 (unsigned long) flags);
7898 NOT_REACHED; /* NOTREACHED */
7903 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7905 if (RExC_lastparse!=RExC_parse) { \
7906 PerlIO_printf(Perl_debug_log, "%s", \
7907 Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
7908 RExC_end - RExC_parse, 16, \
7910 PERL_PV_ESCAPE_UNI_DETECT | \
7911 PERL_PV_PRETTY_ELLIPSES | \
7912 PERL_PV_PRETTY_LTGT | \
7913 PERL_PV_ESCAPE_RE | \
7914 PERL_PV_PRETTY_EXACTSIZE \
7918 PerlIO_printf(Perl_debug_log,"%16s",""); \
7921 num = RExC_size + 1; \
7923 num=REG_NODE_NUM(RExC_emit); \
7924 if (RExC_lastnum!=num) \
7925 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7927 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7928 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7929 (int)((depth*2)), "", \
7933 RExC_lastparse=RExC_parse; \
7938 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7939 DEBUG_PARSE_MSG((funcname)); \
7940 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7942 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7943 DEBUG_PARSE_MSG((funcname)); \
7944 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7947 /* This section of code defines the inversion list object and its methods. The
7948 * interfaces are highly subject to change, so as much as possible is static to
7949 * this file. An inversion list is here implemented as a malloc'd C UV array
7950 * as an SVt_INVLIST scalar.
7952 * An inversion list for Unicode is an array of code points, sorted by ordinal
7953 * number. The zeroth element is the first code point in the list. The 1th
7954 * element is the first element beyond that not in the list. In other words,
7955 * the first range is
7956 * invlist[0]..(invlist[1]-1)
7957 * The other ranges follow. Thus every element whose index is divisible by two
7958 * marks the beginning of a range that is in the list, and every element not
7959 * divisible by two marks the beginning of a range not in the list. A single
7960 * element inversion list that contains the single code point N generally
7961 * consists of two elements
7964 * (The exception is when N is the highest representable value on the
7965 * machine, in which case the list containing just it would be a single
7966 * element, itself. By extension, if the last range in the list extends to
7967 * infinity, then the first element of that range will be in the inversion list
7968 * at a position that is divisible by two, and is the final element in the
7970 * Taking the complement (inverting) an inversion list is quite simple, if the
7971 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7972 * This implementation reserves an element at the beginning of each inversion
7973 * list to always contain 0; there is an additional flag in the header which
7974 * indicates if the list begins at the 0, or is offset to begin at the next
7977 * More about inversion lists can be found in "Unicode Demystified"
7978 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7979 * More will be coming when functionality is added later.
7981 * The inversion list data structure is currently implemented as an SV pointing
7982 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7983 * array of UV whose memory management is automatically handled by the existing
7984 * facilities for SV's.
7986 * Some of the methods should always be private to the implementation, and some
7987 * should eventually be made public */
7989 /* The header definitions are in F<inline_invlist.c> */
7991 PERL_STATIC_INLINE UV*
7992 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7994 /* Returns a pointer to the first element in the inversion list's array.
7995 * This is called upon initialization of an inversion list. Where the
7996 * array begins depends on whether the list has the code point U+0000 in it
7997 * or not. The other parameter tells it whether the code that follows this
7998 * call is about to put a 0 in the inversion list or not. The first
7999 * element is either the element reserved for 0, if TRUE, or the element
8000 * after it, if FALSE */
8002 bool* offset = get_invlist_offset_addr(invlist);
8003 UV* zero_addr = (UV *) SvPVX(invlist);
8005 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
8008 assert(! _invlist_len(invlist));
8012 /* 1^1 = 0; 1^0 = 1 */
8013 *offset = 1 ^ will_have_0;
8014 return zero_addr + *offset;
8017 PERL_STATIC_INLINE void
8018 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
8020 /* Sets the current number of elements stored in the inversion list.
8021 * Updates SvCUR correspondingly */
8022 PERL_UNUSED_CONTEXT;
8023 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
8025 assert(SvTYPE(invlist) == SVt_INVLIST);
8030 : TO_INTERNAL_SIZE(len + offset));
8031 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
8034 #ifndef PERL_IN_XSUB_RE
8036 PERL_STATIC_INLINE IV*
8037 S_get_invlist_previous_index_addr(SV* invlist)
8039 /* Return the address of the IV that is reserved to hold the cached index
8041 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
8043 assert(SvTYPE(invlist) == SVt_INVLIST);
8045 return &(((XINVLIST*) SvANY(invlist))->prev_index);
8048 PERL_STATIC_INLINE IV
8049 S_invlist_previous_index(SV* const invlist)
8051 /* Returns cached index of previous search */
8053 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
8055 return *get_invlist_previous_index_addr(invlist);
8058 PERL_STATIC_INLINE void
8059 S_invlist_set_previous_index(SV* const invlist, const IV index)
8061 /* Caches <index> for later retrieval */
8063 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
8065 assert(index == 0 || index < (int) _invlist_len(invlist));
8067 *get_invlist_previous_index_addr(invlist) = index;
8070 PERL_STATIC_INLINE void
8071 S_invlist_trim(SV* const invlist)
8073 PERL_ARGS_ASSERT_INVLIST_TRIM;
8075 assert(SvTYPE(invlist) == SVt_INVLIST);
8077 /* Change the length of the inversion list to how many entries it currently
8079 SvPV_shrink_to_cur((SV *) invlist);
8082 PERL_STATIC_INLINE bool
8083 S_invlist_is_iterating(SV* const invlist)
8085 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
8087 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
8090 #endif /* ifndef PERL_IN_XSUB_RE */
8092 PERL_STATIC_INLINE UV
8093 S_invlist_max(SV* const invlist)
8095 /* Returns the maximum number of elements storable in the inversion list's
8096 * array, without having to realloc() */
8098 PERL_ARGS_ASSERT_INVLIST_MAX;
8100 assert(SvTYPE(invlist) == SVt_INVLIST);
8102 /* Assumes worst case, in which the 0 element is not counted in the
8103 * inversion list, so subtracts 1 for that */
8104 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
8105 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
8106 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
8109 #ifndef PERL_IN_XSUB_RE
8111 Perl__new_invlist(pTHX_ IV initial_size)
8114 /* Return a pointer to a newly constructed inversion list, with enough
8115 * space to store 'initial_size' elements. If that number is negative, a
8116 * system default is used instead */
8120 if (initial_size < 0) {
8124 /* Allocate the initial space */
8125 new_list = newSV_type(SVt_INVLIST);
8127 /* First 1 is in case the zero element isn't in the list; second 1 is for
8129 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
8130 invlist_set_len(new_list, 0, 0);
8132 /* Force iterinit() to be used to get iteration to work */
8133 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8135 *get_invlist_previous_index_addr(new_list) = 0;
8141 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8143 /* Return a pointer to a newly constructed inversion list, initialized to
8144 * point to <list>, which has to be in the exact correct inversion list
8145 * form, including internal fields. Thus this is a dangerous routine that
8146 * should not be used in the wrong hands. The passed in 'list' contains
8147 * several header fields at the beginning that are not part of the
8148 * inversion list body proper */
8150 const STRLEN length = (STRLEN) list[0];
8151 const UV version_id = list[1];
8152 const bool offset = cBOOL(list[2]);
8153 #define HEADER_LENGTH 3
8154 /* If any of the above changes in any way, you must change HEADER_LENGTH
8155 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8156 * perl -E 'say int(rand 2**31-1)'
8158 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8159 data structure type, so that one being
8160 passed in can be validated to be an
8161 inversion list of the correct vintage.
8164 SV* invlist = newSV_type(SVt_INVLIST);
8166 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8168 if (version_id != INVLIST_VERSION_ID) {
8169 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8172 /* The generated array passed in includes header elements that aren't part
8173 * of the list proper, so start it just after them */
8174 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8176 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8177 shouldn't touch it */
8179 *(get_invlist_offset_addr(invlist)) = offset;
8181 /* The 'length' passed to us is the physical number of elements in the
8182 * inversion list. But if there is an offset the logical number is one
8184 invlist_set_len(invlist, length - offset, offset);
8186 invlist_set_previous_index(invlist, 0);
8188 /* Initialize the iteration pointer. */
8189 invlist_iterfinish(invlist);
8191 SvREADONLY_on(invlist);
8195 #endif /* ifndef PERL_IN_XSUB_RE */
8198 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8200 /* Grow the maximum size of an inversion list */
8202 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8204 assert(SvTYPE(invlist) == SVt_INVLIST);
8206 /* Add one to account for the zero element at the beginning which may not
8207 * be counted by the calling parameters */
8208 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8212 S__append_range_to_invlist(pTHX_ SV* const invlist,
8213 const UV start, const UV end)
8215 /* Subject to change or removal. Append the range from 'start' to 'end' at
8216 * the end of the inversion list. The range must be above any existing
8220 UV max = invlist_max(invlist);
8221 UV len = _invlist_len(invlist);
8224 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8226 if (len == 0) { /* Empty lists must be initialized */
8227 offset = start != 0;
8228 array = _invlist_array_init(invlist, ! offset);
8231 /* Here, the existing list is non-empty. The current max entry in the
8232 * list is generally the first value not in the set, except when the
8233 * set extends to the end of permissible values, in which case it is
8234 * the first entry in that final set, and so this call is an attempt to
8235 * append out-of-order */
8237 UV final_element = len - 1;
8238 array = invlist_array(invlist);
8239 if (array[final_element] > start
8240 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8242 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8243 array[final_element], start,
8244 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8247 /* Here, it is a legal append. If the new range begins with the first
8248 * value not in the set, it is extending the set, so the new first
8249 * value not in the set is one greater than the newly extended range.
8251 offset = *get_invlist_offset_addr(invlist);
8252 if (array[final_element] == start) {
8253 if (end != UV_MAX) {
8254 array[final_element] = end + 1;
8257 /* But if the end is the maximum representable on the machine,
8258 * just let the range that this would extend to have no end */
8259 invlist_set_len(invlist, len - 1, offset);
8265 /* Here the new range doesn't extend any existing set. Add it */
8267 len += 2; /* Includes an element each for the start and end of range */
8269 /* If wll overflow the existing space, extend, which may cause the array to
8272 invlist_extend(invlist, len);
8274 /* Have to set len here to avoid assert failure in invlist_array() */
8275 invlist_set_len(invlist, len, offset);
8277 array = invlist_array(invlist);
8280 invlist_set_len(invlist, len, offset);
8283 /* The next item on the list starts the range, the one after that is
8284 * one past the new range. */
8285 array[len - 2] = start;
8286 if (end != UV_MAX) {
8287 array[len - 1] = end + 1;
8290 /* But if the end is the maximum representable on the machine, just let
8291 * the range have no end */
8292 invlist_set_len(invlist, len - 1, offset);
8296 #ifndef PERL_IN_XSUB_RE
8299 Perl__invlist_search(SV* const invlist, const UV cp)
8301 /* Searches the inversion list for the entry that contains the input code
8302 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8303 * return value is the index into the list's array of the range that
8308 IV high = _invlist_len(invlist);
8309 const IV highest_element = high - 1;
8312 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8314 /* If list is empty, return failure. */
8319 /* (We can't get the array unless we know the list is non-empty) */
8320 array = invlist_array(invlist);
8322 mid = invlist_previous_index(invlist);
8323 assert(mid >=0 && mid <= highest_element);
8325 /* <mid> contains the cache of the result of the previous call to this
8326 * function (0 the first time). See if this call is for the same result,
8327 * or if it is for mid-1. This is under the theory that calls to this
8328 * function will often be for related code points that are near each other.
8329 * And benchmarks show that caching gives better results. We also test
8330 * here if the code point is within the bounds of the list. These tests
8331 * replace others that would have had to be made anyway to make sure that
8332 * the array bounds were not exceeded, and these give us extra information
8333 * at the same time */
8334 if (cp >= array[mid]) {
8335 if (cp >= array[highest_element]) {
8336 return highest_element;
8339 /* Here, array[mid] <= cp < array[highest_element]. This means that
8340 * the final element is not the answer, so can exclude it; it also
8341 * means that <mid> is not the final element, so can refer to 'mid + 1'
8343 if (cp < array[mid + 1]) {
8349 else { /* cp < aray[mid] */
8350 if (cp < array[0]) { /* Fail if outside the array */
8354 if (cp >= array[mid - 1]) {
8359 /* Binary search. What we are looking for is <i> such that
8360 * array[i] <= cp < array[i+1]
8361 * The loop below converges on the i+1. Note that there may not be an
8362 * (i+1)th element in the array, and things work nonetheless */
8363 while (low < high) {
8364 mid = (low + high) / 2;
8365 assert(mid <= highest_element);
8366 if (array[mid] <= cp) { /* cp >= array[mid] */
8369 /* We could do this extra test to exit the loop early.
8370 if (cp < array[low]) {
8375 else { /* cp < array[mid] */
8382 invlist_set_previous_index(invlist, high);
8387 Perl__invlist_populate_swatch(SV* const invlist,
8388 const UV start, const UV end, U8* swatch)
8390 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8391 * but is used when the swash has an inversion list. This makes this much
8392 * faster, as it uses a binary search instead of a linear one. This is
8393 * intimately tied to that function, and perhaps should be in utf8.c,
8394 * except it is intimately tied to inversion lists as well. It assumes
8395 * that <swatch> is all 0's on input */
8398 const IV len = _invlist_len(invlist);
8402 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8404 if (len == 0) { /* Empty inversion list */
8408 array = invlist_array(invlist);
8410 /* Find which element it is */
8411 i = _invlist_search(invlist, start);
8413 /* We populate from <start> to <end> */
8414 while (current < end) {
8417 /* The inversion list gives the results for every possible code point
8418 * after the first one in the list. Only those ranges whose index is
8419 * even are ones that the inversion list matches. For the odd ones,
8420 * and if the initial code point is not in the list, we have to skip
8421 * forward to the next element */
8422 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8424 if (i >= len) { /* Finished if beyond the end of the array */
8428 if (current >= end) { /* Finished if beyond the end of what we
8430 if (LIKELY(end < UV_MAX)) {
8434 /* We get here when the upper bound is the maximum
8435 * representable on the machine, and we are looking for just
8436 * that code point. Have to special case it */
8438 goto join_end_of_list;
8441 assert(current >= start);
8443 /* The current range ends one below the next one, except don't go past
8446 upper = (i < len && array[i] < end) ? array[i] : end;
8448 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8449 * for each code point in it */
8450 for (; current < upper; current++) {
8451 const STRLEN offset = (STRLEN)(current - start);
8452 swatch[offset >> 3] |= 1 << (offset & 7);
8457 /* Quit if at the end of the list */
8460 /* But first, have to deal with the highest possible code point on
8461 * the platform. The previous code assumes that <end> is one
8462 * beyond where we want to populate, but that is impossible at the
8463 * platform's infinity, so have to handle it specially */
8464 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8466 const STRLEN offset = (STRLEN)(end - start);
8467 swatch[offset >> 3] |= 1 << (offset & 7);
8472 /* Advance to the next range, which will be for code points not in the
8481 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8482 const bool complement_b, SV** output)
8484 /* Take the union of two inversion lists and point <output> to it. *output
8485 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8486 * the reference count to that list will be decremented if not already a
8487 * temporary (mortal); otherwise *output will be made correspondingly
8488 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8489 * second list is returned. If <complement_b> is TRUE, the union is taken
8490 * of the complement (inversion) of <b> instead of b itself.
8492 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8493 * Richard Gillam, published by Addison-Wesley, and explained at some
8494 * length there. The preface says to incorporate its examples into your
8495 * code at your own risk.
8497 * The algorithm is like a merge sort.
8499 * XXX A potential performance improvement is to keep track as we go along
8500 * if only one of the inputs contributes to the result, meaning the other
8501 * is a subset of that one. In that case, we can skip the final copy and
8502 * return the larger of the input lists, but then outside code might need
8503 * to keep track of whether to free the input list or not */
8505 const UV* array_a; /* a's array */
8507 UV len_a; /* length of a's array */
8510 SV* u; /* the resulting union */
8514 UV i_a = 0; /* current index into a's array */
8518 /* running count, as explained in the algorithm source book; items are
8519 * stopped accumulating and are output when the count changes to/from 0.
8520 * The count is incremented when we start a range that's in the set, and
8521 * decremented when we start a range that's not in the set. So its range
8522 * is 0 to 2. Only when the count is zero is something not in the set.
8526 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8529 /* If either one is empty, the union is the other one */
8530 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8531 bool make_temp = FALSE; /* Should we mortalize the result? */
8535 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8541 *output = invlist_clone(b);
8543 _invlist_invert(*output);
8545 } /* else *output already = b; */
8548 sv_2mortal(*output);
8552 else if ((len_b = _invlist_len(b)) == 0) {
8553 bool make_temp = FALSE;
8555 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8560 /* The complement of an empty list is a list that has everything in it,
8561 * so the union with <a> includes everything too */
8564 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8568 *output = _new_invlist(1);
8569 _append_range_to_invlist(*output, 0, UV_MAX);
8571 else if (*output != a) {
8572 *output = invlist_clone(a);
8574 /* else *output already = a; */
8577 sv_2mortal(*output);
8582 /* Here both lists exist and are non-empty */
8583 array_a = invlist_array(a);
8584 array_b = invlist_array(b);
8586 /* If are to take the union of 'a' with the complement of b, set it
8587 * up so are looking at b's complement. */
8590 /* To complement, we invert: if the first element is 0, remove it. To
8591 * do this, we just pretend the array starts one later */
8592 if (array_b[0] == 0) {
8598 /* But if the first element is not zero, we pretend the list starts
8599 * at the 0 that is always stored immediately before the array. */
8605 /* Size the union for the worst case: that the sets are completely
8607 u = _new_invlist(len_a + len_b);
8609 /* Will contain U+0000 if either component does */
8610 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8611 || (len_b > 0 && array_b[0] == 0));
8613 /* Go through each list item by item, stopping when exhausted one of
8615 while (i_a < len_a && i_b < len_b) {
8616 UV cp; /* The element to potentially add to the union's array */
8617 bool cp_in_set; /* is it in the the input list's set or not */
8619 /* We need to take one or the other of the two inputs for the union.
8620 * Since we are merging two sorted lists, we take the smaller of the
8621 * next items. In case of a tie, we take the one that is in its set
8622 * first. If we took one not in the set first, it would decrement the
8623 * count, possibly to 0 which would cause it to be output as ending the
8624 * range, and the next time through we would take the same number, and
8625 * output it again as beginning the next range. By doing it the
8626 * opposite way, there is no possibility that the count will be
8627 * momentarily decremented to 0, and thus the two adjoining ranges will
8628 * be seamlessly merged. (In a tie and both are in the set or both not
8629 * in the set, it doesn't matter which we take first.) */
8630 if (array_a[i_a] < array_b[i_b]
8631 || (array_a[i_a] == array_b[i_b]
8632 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8634 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8638 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8639 cp = array_b[i_b++];
8642 /* Here, have chosen which of the two inputs to look at. Only output
8643 * if the running count changes to/from 0, which marks the
8644 * beginning/end of a range in that's in the set */
8647 array_u[i_u++] = cp;
8654 array_u[i_u++] = cp;
8659 /* Here, we are finished going through at least one of the lists, which
8660 * means there is something remaining in at most one. We check if the list
8661 * that hasn't been exhausted is positioned such that we are in the middle
8662 * of a range in its set or not. (i_a and i_b point to the element beyond
8663 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8664 * is potentially more to output.
8665 * There are four cases:
8666 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8667 * in the union is entirely from the non-exhausted set.
8668 * 2) Both were in their sets, count is 2. Nothing further should
8669 * be output, as everything that remains will be in the exhausted
8670 * list's set, hence in the union; decrementing to 1 but not 0 insures
8672 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8673 * Nothing further should be output because the union includes
8674 * everything from the exhausted set. Not decrementing ensures that.
8675 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8676 * decrementing to 0 insures that we look at the remainder of the
8677 * non-exhausted set */
8678 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8679 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8684 /* The final length is what we've output so far, plus what else is about to
8685 * be output. (If 'count' is non-zero, then the input list we exhausted
8686 * has everything remaining up to the machine's limit in its set, and hence
8687 * in the union, so there will be no further output. */
8690 /* At most one of the subexpressions will be non-zero */
8691 len_u += (len_a - i_a) + (len_b - i_b);
8694 /* Set result to final length, which can change the pointer to array_u, so
8696 if (len_u != _invlist_len(u)) {
8697 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8699 array_u = invlist_array(u);
8702 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8703 * the other) ended with everything above it not in its set. That means
8704 * that the remaining part of the union is precisely the same as the
8705 * non-exhausted list, so can just copy it unchanged. (If both list were
8706 * exhausted at the same time, then the operations below will be both 0.)
8709 IV copy_count; /* At most one will have a non-zero copy count */
8710 if ((copy_count = len_a - i_a) > 0) {
8711 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8713 else if ((copy_count = len_b - i_b) > 0) {
8714 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8718 /* We may be removing a reference to one of the inputs. If so, the output
8719 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8720 * count decremented) */
8721 if (a == *output || b == *output) {
8722 assert(! invlist_is_iterating(*output));
8723 if ((SvTEMP(*output))) {
8727 SvREFCNT_dec_NN(*output);
8737 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8738 const bool complement_b, SV** i)
8740 /* Take the intersection of two inversion lists and point <i> to it. *i
8741 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8742 * the reference count to that list will be decremented if not already a
8743 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8744 * The first list, <a>, may be NULL, in which case an empty list is
8745 * returned. If <complement_b> is TRUE, the result will be the
8746 * intersection of <a> and the complement (or inversion) of <b> instead of
8749 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8750 * Richard Gillam, published by Addison-Wesley, and explained at some
8751 * length there. The preface says to incorporate its examples into your
8752 * code at your own risk. In fact, it had bugs
8754 * The algorithm is like a merge sort, and is essentially the same as the
8758 const UV* array_a; /* a's array */
8760 UV len_a; /* length of a's array */
8763 SV* r; /* the resulting intersection */
8767 UV i_a = 0; /* current index into a's array */
8771 /* running count, as explained in the algorithm source book; items are
8772 * stopped accumulating and are output when the count changes to/from 2.
8773 * The count is incremented when we start a range that's in the set, and
8774 * decremented when we start a range that's not in the set. So its range
8775 * is 0 to 2. Only when the count is 2 is something in the intersection.
8779 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8782 /* Special case if either one is empty */
8783 len_a = (a == NULL) ? 0 : _invlist_len(a);
8784 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8785 bool make_temp = FALSE;
8787 if (len_a != 0 && complement_b) {
8789 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8790 * be empty. Here, also we are using 'b's complement, which hence
8791 * must be every possible code point. Thus the intersection is
8795 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8800 *i = invlist_clone(a);
8802 /* else *i is already 'a' */
8810 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8811 * intersection must be empty */
8813 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8818 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8822 *i = _new_invlist(0);
8830 /* Here both lists exist and are non-empty */
8831 array_a = invlist_array(a);
8832 array_b = invlist_array(b);
8834 /* If are to take the intersection of 'a' with the complement of b, set it
8835 * up so are looking at b's complement. */
8838 /* To complement, we invert: if the first element is 0, remove it. To
8839 * do this, we just pretend the array starts one later */
8840 if (array_b[0] == 0) {
8846 /* But if the first element is not zero, we pretend the list starts
8847 * at the 0 that is always stored immediately before the array. */
8853 /* Size the intersection for the worst case: that the intersection ends up
8854 * fragmenting everything to be completely disjoint */
8855 r= _new_invlist(len_a + len_b);
8857 /* Will contain U+0000 iff both components do */
8858 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8859 && len_b > 0 && array_b[0] == 0);
8861 /* Go through each list item by item, stopping when exhausted one of
8863 while (i_a < len_a && i_b < len_b) {
8864 UV cp; /* The element to potentially add to the intersection's
8866 bool cp_in_set; /* Is it in the input list's set or not */
8868 /* We need to take one or the other of the two inputs for the
8869 * intersection. Since we are merging two sorted lists, we take the
8870 * smaller of the next items. In case of a tie, we take the one that
8871 * is not in its set first (a difference from the union algorithm). If
8872 * we took one in the set first, it would increment the count, possibly
8873 * to 2 which would cause it to be output as starting a range in the
8874 * intersection, and the next time through we would take that same
8875 * number, and output it again as ending the set. By doing it the
8876 * opposite of this, there is no possibility that the count will be
8877 * momentarily incremented to 2. (In a tie and both are in the set or
8878 * both not in the set, it doesn't matter which we take first.) */
8879 if (array_a[i_a] < array_b[i_b]
8880 || (array_a[i_a] == array_b[i_b]
8881 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8883 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8887 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8891 /* Here, have chosen which of the two inputs to look at. Only output
8892 * if the running count changes to/from 2, which marks the
8893 * beginning/end of a range that's in the intersection */
8897 array_r[i_r++] = cp;
8902 array_r[i_r++] = cp;
8908 /* Here, we are finished going through at least one of the lists, which
8909 * means there is something remaining in at most one. We check if the list
8910 * that has been exhausted is positioned such that we are in the middle
8911 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8912 * the ones we care about.) There are four cases:
8913 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8914 * nothing left in the intersection.
8915 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8916 * above 2. What should be output is exactly that which is in the
8917 * non-exhausted set, as everything it has is also in the intersection
8918 * set, and everything it doesn't have can't be in the intersection
8919 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8920 * gets incremented to 2. Like the previous case, the intersection is
8921 * everything that remains in the non-exhausted set.
8922 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8923 * remains 1. And the intersection has nothing more. */
8924 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8925 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8930 /* The final length is what we've output so far plus what else is in the
8931 * intersection. At most one of the subexpressions below will be non-zero
8935 len_r += (len_a - i_a) + (len_b - i_b);
8938 /* Set result to final length, which can change the pointer to array_r, so
8940 if (len_r != _invlist_len(r)) {
8941 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8943 array_r = invlist_array(r);
8946 /* Finish outputting any remaining */
8947 if (count >= 2) { /* At most one will have a non-zero copy count */
8949 if ((copy_count = len_a - i_a) > 0) {
8950 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8952 else if ((copy_count = len_b - i_b) > 0) {
8953 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8957 /* We may be removing a reference to one of the inputs. If so, the output
8958 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8959 * count decremented) */
8960 if (a == *i || b == *i) {
8961 assert(! invlist_is_iterating(*i));
8966 SvREFCNT_dec_NN(*i);
8976 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8978 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8979 * set. A pointer to the inversion list is returned. This may actually be
8980 * a new list, in which case the passed in one has been destroyed. The
8981 * passed-in inversion list can be NULL, in which case a new one is created
8982 * with just the one range in it */
8987 if (invlist == NULL) {
8988 invlist = _new_invlist(2);
8992 len = _invlist_len(invlist);
8995 /* If comes after the final entry actually in the list, can just append it
8998 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8999 && start >= invlist_array(invlist)[len - 1]))
9001 _append_range_to_invlist(invlist, start, end);
9005 /* Here, can't just append things, create and return a new inversion list
9006 * which is the union of this range and the existing inversion list */
9007 range_invlist = _new_invlist(2);
9008 _append_range_to_invlist(range_invlist, start, end);
9010 _invlist_union(invlist, range_invlist, &invlist);
9012 /* The temporary can be freed */
9013 SvREFCNT_dec_NN(range_invlist);
9019 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
9020 UV** other_elements_ptr)
9022 /* Create and return an inversion list whose contents are to be populated
9023 * by the caller. The caller gives the number of elements (in 'size') and
9024 * the very first element ('element0'). This function will set
9025 * '*other_elements_ptr' to an array of UVs, where the remaining elements
9028 * Obviously there is some trust involved that the caller will properly
9029 * fill in the other elements of the array.
9031 * (The first element needs to be passed in, as the underlying code does
9032 * things differently depending on whether it is zero or non-zero) */
9034 SV* invlist = _new_invlist(size);
9037 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
9039 _append_range_to_invlist(invlist, element0, element0);
9040 offset = *get_invlist_offset_addr(invlist);
9042 invlist_set_len(invlist, size, offset);
9043 *other_elements_ptr = invlist_array(invlist) + 1;
9049 PERL_STATIC_INLINE SV*
9050 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
9051 return _add_range_to_invlist(invlist, cp, cp);
9054 #ifndef PERL_IN_XSUB_RE
9056 Perl__invlist_invert(pTHX_ SV* const invlist)
9058 /* Complement the input inversion list. This adds a 0 if the list didn't
9059 * have a zero; removes it otherwise. As described above, the data
9060 * structure is set up so that this is very efficient */
9062 PERL_ARGS_ASSERT__INVLIST_INVERT;
9064 assert(! invlist_is_iterating(invlist));
9066 /* The inverse of matching nothing is matching everything */
9067 if (_invlist_len(invlist) == 0) {
9068 _append_range_to_invlist(invlist, 0, UV_MAX);
9072 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
9077 PERL_STATIC_INLINE SV*
9078 S_invlist_clone(pTHX_ SV* const invlist)
9081 /* Return a new inversion list that is a copy of the input one, which is
9082 * unchanged. The new list will not be mortal even if the old one was. */
9084 /* Need to allocate extra space to accommodate Perl's addition of a
9085 * trailing NUL to SvPV's, since it thinks they are always strings */
9086 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
9087 STRLEN physical_length = SvCUR(invlist);
9088 bool offset = *(get_invlist_offset_addr(invlist));
9090 PERL_ARGS_ASSERT_INVLIST_CLONE;
9092 *(get_invlist_offset_addr(new_invlist)) = offset;
9093 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
9094 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
9099 PERL_STATIC_INLINE STRLEN*
9100 S_get_invlist_iter_addr(SV* invlist)
9102 /* Return the address of the UV that contains the current iteration
9105 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
9107 assert(SvTYPE(invlist) == SVt_INVLIST);
9109 return &(((XINVLIST*) SvANY(invlist))->iterator);
9112 PERL_STATIC_INLINE void
9113 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
9115 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
9117 *get_invlist_iter_addr(invlist) = 0;
9120 PERL_STATIC_INLINE void
9121 S_invlist_iterfinish(SV* invlist)
9123 /* Terminate iterator for invlist. This is to catch development errors.
9124 * Any iteration that is interrupted before completed should call this
9125 * function. Functions that add code points anywhere else but to the end
9126 * of an inversion list assert that they are not in the middle of an
9127 * iteration. If they were, the addition would make the iteration
9128 * problematical: if the iteration hadn't reached the place where things
9129 * were being added, it would be ok */
9131 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9133 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9137 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9139 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9140 * This call sets in <*start> and <*end>, the next range in <invlist>.
9141 * Returns <TRUE> if successful and the next call will return the next
9142 * range; <FALSE> if was already at the end of the list. If the latter,
9143 * <*start> and <*end> are unchanged, and the next call to this function
9144 * will start over at the beginning of the list */
9146 STRLEN* pos = get_invlist_iter_addr(invlist);
9147 UV len = _invlist_len(invlist);
9150 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9153 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9157 array = invlist_array(invlist);
9159 *start = array[(*pos)++];
9165 *end = array[(*pos)++] - 1;
9171 PERL_STATIC_INLINE UV
9172 S_invlist_highest(SV* const invlist)
9174 /* Returns the highest code point that matches an inversion list. This API
9175 * has an ambiguity, as it returns 0 under either the highest is actually
9176 * 0, or if the list is empty. If this distinction matters to you, check
9177 * for emptiness before calling this function */
9179 UV len = _invlist_len(invlist);
9182 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9188 array = invlist_array(invlist);
9190 /* The last element in the array in the inversion list always starts a
9191 * range that goes to infinity. That range may be for code points that are
9192 * matched in the inversion list, or it may be for ones that aren't
9193 * matched. In the latter case, the highest code point in the set is one
9194 * less than the beginning of this range; otherwise it is the final element
9195 * of this range: infinity */
9196 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9198 : array[len - 1] - 1;
9201 #ifndef PERL_IN_XSUB_RE
9203 Perl__invlist_contents(pTHX_ SV* const invlist)
9205 /* Get the contents of an inversion list into a string SV so that they can
9206 * be printed out. It uses the format traditionally done for debug tracing
9210 SV* output = newSVpvs("\n");
9212 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9214 assert(! invlist_is_iterating(invlist));
9216 invlist_iterinit(invlist);
9217 while (invlist_iternext(invlist, &start, &end)) {
9218 if (end == UV_MAX) {
9219 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9221 else if (end != start) {
9222 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9226 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9234 #ifndef PERL_IN_XSUB_RE
9236 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9237 const char * const indent, SV* const invlist)
9239 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9240 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9241 * the string 'indent'. The output looks like this:
9242 [0] 0x000A .. 0x000D
9244 [4] 0x2028 .. 0x2029
9245 [6] 0x3104 .. INFINITY
9246 * This means that the first range of code points matched by the list are
9247 * 0xA through 0xD; the second range contains only the single code point
9248 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9249 * are used to define each range (except if the final range extends to
9250 * infinity, only a single element is needed). The array index of the
9251 * first element for the corresponding range is given in brackets. */
9256 PERL_ARGS_ASSERT__INVLIST_DUMP;
9258 if (invlist_is_iterating(invlist)) {
9259 Perl_dump_indent(aTHX_ level, file,
9260 "%sCan't dump inversion list because is in middle of iterating\n",
9265 invlist_iterinit(invlist);
9266 while (invlist_iternext(invlist, &start, &end)) {
9267 if (end == UV_MAX) {
9268 Perl_dump_indent(aTHX_ level, file,
9269 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9270 indent, (UV)count, start);
9272 else if (end != start) {
9273 Perl_dump_indent(aTHX_ level, file,
9274 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9275 indent, (UV)count, start, end);
9278 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9279 indent, (UV)count, start);
9286 Perl__load_PL_utf8_foldclosures (pTHX)
9288 assert(! PL_utf8_foldclosures);
9290 /* If the folds haven't been read in, call a fold function
9292 if (! PL_utf8_tofold) {
9293 U8 dummy[UTF8_MAXBYTES_CASE+1];
9295 /* This string is just a short named one above \xff */
9296 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9297 assert(PL_utf8_tofold); /* Verify that worked */
9299 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9303 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9305 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9307 /* Return a boolean as to if the two passed in inversion lists are
9308 * identical. The final argument, if TRUE, says to take the complement of
9309 * the second inversion list before doing the comparison */
9311 const UV* array_a = invlist_array(a);
9312 const UV* array_b = invlist_array(b);
9313 UV len_a = _invlist_len(a);
9314 UV len_b = _invlist_len(b);
9316 UV i = 0; /* current index into the arrays */
9317 bool retval = TRUE; /* Assume are identical until proven otherwise */
9319 PERL_ARGS_ASSERT__INVLISTEQ;
9321 /* If are to compare 'a' with the complement of b, set it
9322 * up so are looking at b's complement. */
9325 /* The complement of nothing is everything, so <a> would have to have
9326 * just one element, starting at zero (ending at infinity) */
9328 return (len_a == 1 && array_a[0] == 0);
9330 else if (array_b[0] == 0) {
9332 /* Otherwise, to complement, we invert. Here, the first element is
9333 * 0, just remove it. To do this, we just pretend the array starts
9341 /* But if the first element is not zero, we pretend the list starts
9342 * at the 0 that is always stored immediately before the array. */
9348 /* Make sure that the lengths are the same, as well as the final element
9349 * before looping through the remainder. (Thus we test the length, final,
9350 * and first elements right off the bat) */
9351 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9354 else for (i = 0; i < len_a - 1; i++) {
9355 if (array_a[i] != array_b[i]) {
9366 * As best we can, determine the characters that can match the start of
9367 * the given EXACTF-ish node.
9369 * Returns the invlist as a new SV*; it is the caller's responsibility to
9370 * call SvREFCNT_dec() when done with it.
9373 S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
9375 const U8 * s = (U8*)STRING(node);
9376 SSize_t bytelen = STR_LEN(node);
9378 /* Start out big enough for 2 separate code points */
9379 SV* invlist = _new_invlist(4);
9381 PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
9386 /* We punt and assume can match anything if the node begins
9387 * with a multi-character fold. Things are complicated. For
9388 * example, /ffi/i could match any of:
9389 * "\N{LATIN SMALL LIGATURE FFI}"
9390 * "\N{LATIN SMALL LIGATURE FF}I"
9391 * "F\N{LATIN SMALL LIGATURE FI}"
9392 * plus several other things; and making sure we have all the
9393 * possibilities is hard. */
9394 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
9395 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9398 /* Any Latin1 range character can potentially match any
9399 * other depending on the locale */
9400 if (OP(node) == EXACTFL) {
9401 _invlist_union(invlist, PL_Latin1, &invlist);
9404 /* But otherwise, it matches at least itself. We can
9405 * quickly tell if it has a distinct fold, and if so,
9406 * it matches that as well */
9407 invlist = add_cp_to_invlist(invlist, uc);
9408 if (IS_IN_SOME_FOLD_L1(uc))
9409 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
9412 /* Some characters match above-Latin1 ones under /i. This
9413 * is true of EXACTFL ones when the locale is UTF-8 */
9414 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
9415 && (! isASCII(uc) || (OP(node) != EXACTFA
9416 && OP(node) != EXACTFA_NO_TRIE)))
9418 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
9422 else { /* Pattern is UTF-8 */
9423 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
9424 STRLEN foldlen = UTF8SKIP(s);
9425 const U8* e = s + bytelen;
9428 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
9430 /* The only code points that aren't folded in a UTF EXACTFish
9431 * node are are the problematic ones in EXACTFL nodes */
9432 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
9433 /* We need to check for the possibility that this EXACTFL
9434 * node begins with a multi-char fold. Therefore we fold
9435 * the first few characters of it so that we can make that
9440 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
9442 *(d++) = (U8) toFOLD(*s);
9447 to_utf8_fold(s, d, &len);
9453 /* And set up so the code below that looks in this folded
9454 * buffer instead of the node's string */
9456 foldlen = UTF8SKIP(folded);
9460 /* When we reach here 's' points to the fold of the first
9461 * character(s) of the node; and 'e' points to far enough along
9462 * the folded string to be just past any possible multi-char
9463 * fold. 'foldlen' is the length in bytes of the first
9466 * Unlike the non-UTF-8 case, the macro for determining if a
9467 * string is a multi-char fold requires all the characters to
9468 * already be folded. This is because of all the complications
9469 * if not. Note that they are folded anyway, except in EXACTFL
9470 * nodes. Like the non-UTF case above, we punt if the node
9471 * begins with a multi-char fold */
9473 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
9474 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
9476 else { /* Single char fold */
9478 /* It matches all the things that fold to it, which are
9479 * found in PL_utf8_foldclosures (including itself) */
9480 invlist = add_cp_to_invlist(invlist, uc);
9481 if (! PL_utf8_foldclosures)
9482 _load_PL_utf8_foldclosures();
9483 if ((listp = hv_fetch(PL_utf8_foldclosures,
9484 (char *) s, foldlen, FALSE)))
9486 AV* list = (AV*) *listp;
9488 for (k = 0; k <= av_tindex(list); k++) {
9489 SV** c_p = av_fetch(list, k, FALSE);
9495 /* /aa doesn't allow folds between ASCII and non- */
9496 if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
9497 && isASCII(c) != isASCII(uc))
9502 invlist = add_cp_to_invlist(invlist, c);
9511 #undef HEADER_LENGTH
9512 #undef TO_INTERNAL_SIZE
9513 #undef FROM_INTERNAL_SIZE
9514 #undef INVLIST_VERSION_ID
9516 /* End of inversion list object */
9519 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9521 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9522 * constructs, and updates RExC_flags with them. On input, RExC_parse
9523 * should point to the first flag; it is updated on output to point to the
9524 * final ')' or ':'. There needs to be at least one flag, or this will
9527 /* for (?g), (?gc), and (?o) warnings; warning
9528 about (?c) will warn about (?g) -- japhy */
9530 #define WASTED_O 0x01
9531 #define WASTED_G 0x02
9532 #define WASTED_C 0x04
9533 #define WASTED_GC (WASTED_G|WASTED_C)
9534 I32 wastedflags = 0x00;
9535 U32 posflags = 0, negflags = 0;
9536 U32 *flagsp = &posflags;
9537 char has_charset_modifier = '\0';
9539 bool has_use_defaults = FALSE;
9540 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9541 int x_mod_count = 0;
9543 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9545 /* '^' as an initial flag sets certain defaults */
9546 if (UCHARAT(RExC_parse) == '^') {
9548 has_use_defaults = TRUE;
9549 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9550 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9551 ? REGEX_UNICODE_CHARSET
9552 : REGEX_DEPENDS_CHARSET);
9555 cs = get_regex_charset(RExC_flags);
9556 if (cs == REGEX_DEPENDS_CHARSET
9557 && (RExC_utf8 || RExC_uni_semantics))
9559 cs = REGEX_UNICODE_CHARSET;
9562 while (*RExC_parse) {
9563 /* && strchr("iogcmsx", *RExC_parse) */
9564 /* (?g), (?gc) and (?o) are useless here
9565 and must be globally applied -- japhy */
9566 switch (*RExC_parse) {
9568 /* Code for the imsxn flags */
9569 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
9571 case LOCALE_PAT_MOD:
9572 if (has_charset_modifier) {
9573 goto excess_modifier;
9575 else if (flagsp == &negflags) {
9578 cs = REGEX_LOCALE_CHARSET;
9579 has_charset_modifier = LOCALE_PAT_MOD;
9581 case UNICODE_PAT_MOD:
9582 if (has_charset_modifier) {
9583 goto excess_modifier;
9585 else if (flagsp == &negflags) {
9588 cs = REGEX_UNICODE_CHARSET;
9589 has_charset_modifier = UNICODE_PAT_MOD;
9591 case ASCII_RESTRICT_PAT_MOD:
9592 if (flagsp == &negflags) {
9595 if (has_charset_modifier) {
9596 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9597 goto excess_modifier;
9599 /* Doubled modifier implies more restricted */
9600 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9603 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9605 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9607 case DEPENDS_PAT_MOD:
9608 if (has_use_defaults) {
9609 goto fail_modifiers;
9611 else if (flagsp == &negflags) {
9614 else if (has_charset_modifier) {
9615 goto excess_modifier;
9618 /* The dual charset means unicode semantics if the
9619 * pattern (or target, not known until runtime) are
9620 * utf8, or something in the pattern indicates unicode
9622 cs = (RExC_utf8 || RExC_uni_semantics)
9623 ? REGEX_UNICODE_CHARSET
9624 : REGEX_DEPENDS_CHARSET;
9625 has_charset_modifier = DEPENDS_PAT_MOD;
9629 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9630 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9632 else if (has_charset_modifier == *(RExC_parse - 1)) {
9633 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9637 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9639 NOT_REACHED; /*NOTREACHED*/
9642 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9644 NOT_REACHED; /*NOTREACHED*/
9645 case ONCE_PAT_MOD: /* 'o' */
9646 case GLOBAL_PAT_MOD: /* 'g' */
9647 if (PASS2 && ckWARN(WARN_REGEXP)) {
9648 const I32 wflagbit = *RExC_parse == 'o'
9651 if (! (wastedflags & wflagbit) ) {
9652 wastedflags |= wflagbit;
9653 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9656 "Useless (%s%c) - %suse /%c modifier",
9657 flagsp == &negflags ? "?-" : "?",
9659 flagsp == &negflags ? "don't " : "",
9666 case CONTINUE_PAT_MOD: /* 'c' */
9667 if (PASS2 && ckWARN(WARN_REGEXP)) {
9668 if (! (wastedflags & WASTED_C) ) {
9669 wastedflags |= WASTED_GC;
9670 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9673 "Useless (%sc) - %suse /gc modifier",
9674 flagsp == &negflags ? "?-" : "?",
9675 flagsp == &negflags ? "don't " : ""
9680 case KEEPCOPY_PAT_MOD: /* 'p' */
9681 if (flagsp == &negflags) {
9683 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9685 *flagsp |= RXf_PMf_KEEPCOPY;
9689 /* A flag is a default iff it is following a minus, so
9690 * if there is a minus, it means will be trying to
9691 * re-specify a default which is an error */
9692 if (has_use_defaults || flagsp == &negflags) {
9693 goto fail_modifiers;
9696 wastedflags = 0; /* reset so (?g-c) warns twice */
9700 RExC_flags |= posflags;
9701 RExC_flags &= ~negflags;
9702 set_regex_charset(&RExC_flags, cs);
9703 if (RExC_flags & RXf_PMf_FOLD) {
9704 RExC_contains_i = 1;
9707 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9713 RExC_parse += SKIP_IF_CHAR(RExC_parse);
9714 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9715 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9716 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9717 NOT_REACHED; /*NOTREACHED*/
9724 STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
9729 - reg - regular expression, i.e. main body or parenthesized thing
9731 * Caller must absorb opening parenthesis.
9733 * Combining parenthesis handling with the base level of regular expression
9734 * is a trifle forced, but the need to tie the tails of the branches to what
9735 * follows makes it hard to avoid.
9737 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9739 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9741 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9744 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9745 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9746 needs to be restarted.
9747 Otherwise would only return NULL if regbranch() returns NULL, which
9750 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9751 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9752 * 2 is like 1, but indicates that nextchar() has been called to advance
9753 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9754 * this flag alerts us to the need to check for that */
9756 regnode *ret; /* Will be the head of the group. */
9759 regnode *ender = NULL;
9762 U32 oregflags = RExC_flags;
9763 bool have_branch = 0;
9765 I32 freeze_paren = 0;
9766 I32 after_freeze = 0;
9767 I32 num; /* numeric backreferences */
9769 char * parse_start = RExC_parse; /* MJD */
9770 char * const oregcomp_parse = RExC_parse;
9772 GET_RE_DEBUG_FLAGS_DECL;
9774 PERL_ARGS_ASSERT_REG;
9775 DEBUG_PARSE("reg ");
9777 *flagp = 0; /* Tentatively. */
9780 /* Make an OPEN node, if parenthesized. */
9783 /* Under /x, space and comments can be gobbled up between the '(' and
9784 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9785 * intervening space, as the sequence is a token, and a token should be
9787 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9789 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9790 char *start_verb = RExC_parse;
9791 STRLEN verb_len = 0;
9792 char *start_arg = NULL;
9793 unsigned char op = 0;
9795 int internal_argval = 0; /* internal_argval is only useful if
9798 if (has_intervening_patws) {
9800 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9802 while ( *RExC_parse && *RExC_parse != ')' ) {
9803 if ( *RExC_parse == ':' ) {
9804 start_arg = RExC_parse + 1;
9810 verb_len = RExC_parse - start_verb;
9813 while ( *RExC_parse && *RExC_parse != ')' )
9815 if ( *RExC_parse != ')' )
9816 vFAIL("Unterminated verb pattern argument");
9817 if ( RExC_parse == start_arg )
9820 if ( *RExC_parse != ')' )
9821 vFAIL("Unterminated verb pattern");
9824 switch ( *start_verb ) {
9825 case 'A': /* (*ACCEPT) */
9826 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9828 internal_argval = RExC_nestroot;
9831 case 'C': /* (*COMMIT) */
9832 if ( memEQs(start_verb,verb_len,"COMMIT") )
9835 case 'F': /* (*FAIL) */
9836 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9841 case ':': /* (*:NAME) */
9842 case 'M': /* (*MARK:NAME) */
9843 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9848 case 'P': /* (*PRUNE) */
9849 if ( memEQs(start_verb,verb_len,"PRUNE") )
9852 case 'S': /* (*SKIP) */
9853 if ( memEQs(start_verb,verb_len,"SKIP") )
9856 case 'T': /* (*THEN) */
9857 /* [19:06] <TimToady> :: is then */
9858 if ( memEQs(start_verb,verb_len,"THEN") ) {
9860 RExC_seen |= REG_CUTGROUP_SEEN;
9865 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9867 "Unknown verb pattern '%"UTF8f"'",
9868 UTF8fARG(UTF, verb_len, start_verb));
9871 if ( start_arg && internal_argval ) {
9872 vFAIL3("Verb pattern '%.*s' may not have an argument",
9873 verb_len, start_verb);
9874 } else if ( argok < 0 && !start_arg ) {
9875 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9876 verb_len, start_verb);
9878 ret = reganode(pRExC_state, op, internal_argval);
9879 if ( ! internal_argval && ! SIZE_ONLY ) {
9881 SV *sv = newSVpvn( start_arg,
9882 RExC_parse - start_arg);
9883 ARG(ret) = add_data( pRExC_state,
9885 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9892 if (!internal_argval)
9893 RExC_seen |= REG_VERBARG_SEEN;
9894 } else if ( start_arg ) {
9895 vFAIL3("Verb pattern '%.*s' may not have an argument",
9896 verb_len, start_verb);
9898 ret = reg_node(pRExC_state, op);
9900 nextchar(pRExC_state);
9903 else if (*RExC_parse == '?') { /* (?...) */
9904 bool is_logical = 0;
9905 const char * const seqstart = RExC_parse;
9906 const char * endptr;
9907 if (has_intervening_patws) {
9909 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9913 paren = *RExC_parse++;
9914 ret = NULL; /* For look-ahead/behind. */
9917 case 'P': /* (?P...) variants for those used to PCRE/Python */
9918 paren = *RExC_parse++;
9919 if ( paren == '<') /* (?P<...>) named capture */
9921 else if (paren == '>') { /* (?P>name) named recursion */
9922 goto named_recursion;
9924 else if (paren == '=') { /* (?P=...) named backref */
9925 /* this pretty much dupes the code for \k<NAME> in
9926 * regatom(), if you change this make sure you change that
9928 char* name_start = RExC_parse;
9930 SV *sv_dat = reg_scan_name(pRExC_state,
9931 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9932 if (RExC_parse == name_start || *RExC_parse != ')')
9933 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9934 vFAIL2("Sequence %.3s... not terminated",parse_start);
9937 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9938 RExC_rxi->data->data[num]=(void*)sv_dat;
9939 SvREFCNT_inc_simple_void(sv_dat);
9942 ret = reganode(pRExC_state,
9945 : (ASCII_FOLD_RESTRICTED)
9947 : (AT_LEAST_UNI_SEMANTICS)
9955 Set_Node_Offset(ret, parse_start+1);
9956 Set_Node_Cur_Length(ret, parse_start);
9958 nextchar(pRExC_state);
9962 RExC_parse += SKIP_IF_CHAR(RExC_parse);
9963 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9964 vFAIL3("Sequence (%.*s...) not recognized",
9965 RExC_parse-seqstart, seqstart);
9966 NOT_REACHED; /*NOTREACHED*/
9967 case '<': /* (?<...) */
9968 if (*RExC_parse == '!')
9970 else if (*RExC_parse != '=')
9976 case '\'': /* (?'...') */
9977 name_start= RExC_parse;
9978 svname = reg_scan_name(pRExC_state,
9979 SIZE_ONLY /* reverse test from the others */
9980 ? REG_RSN_RETURN_NAME
9981 : REG_RSN_RETURN_NULL);
9982 if (RExC_parse == name_start || *RExC_parse != paren)
9983 vFAIL2("Sequence (?%c... not terminated",
9984 paren=='>' ? '<' : paren);
9988 if (!svname) /* shouldn't happen */
9990 "panic: reg_scan_name returned NULL");
9991 if (!RExC_paren_names) {
9992 RExC_paren_names= newHV();
9993 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9995 RExC_paren_name_list= newAV();
9996 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9999 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
10001 sv_dat = HeVAL(he_str);
10003 /* croak baby croak */
10005 "panic: paren_name hash element allocation failed");
10006 } else if ( SvPOK(sv_dat) ) {
10007 /* (?|...) can mean we have dupes so scan to check
10008 its already been stored. Maybe a flag indicating
10009 we are inside such a construct would be useful,
10010 but the arrays are likely to be quite small, so
10011 for now we punt -- dmq */
10012 IV count = SvIV(sv_dat);
10013 I32 *pv = (I32*)SvPVX(sv_dat);
10015 for ( i = 0 ; i < count ; i++ ) {
10016 if ( pv[i] == RExC_npar ) {
10022 pv = (I32*)SvGROW(sv_dat,
10023 SvCUR(sv_dat) + sizeof(I32)+1);
10024 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
10025 pv[count] = RExC_npar;
10026 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
10029 (void)SvUPGRADE(sv_dat,SVt_PVNV);
10030 sv_setpvn(sv_dat, (char *)&(RExC_npar),
10033 SvIV_set(sv_dat, 1);
10036 /* Yes this does cause a memory leak in debugging Perls
10038 if (!av_store(RExC_paren_name_list,
10039 RExC_npar, SvREFCNT_inc(svname)))
10040 SvREFCNT_dec_NN(svname);
10043 /*sv_dump(sv_dat);*/
10045 nextchar(pRExC_state);
10047 goto capturing_parens;
10049 RExC_seen |= REG_LOOKBEHIND_SEEN;
10050 RExC_in_lookbehind++;
10053 case '=': /* (?=...) */
10054 RExC_seen_zerolen++;
10056 case '!': /* (?!...) */
10057 RExC_seen_zerolen++;
10058 /* check if we're really just a "FAIL" assertion */
10060 nextchar(pRExC_state);
10061 if (*RExC_parse == ')') {
10062 ret=reg_node(pRExC_state, OPFAIL);
10063 nextchar(pRExC_state);
10067 case '|': /* (?|...) */
10068 /* branch reset, behave like a (?:...) except that
10069 buffers in alternations share the same numbers */
10071 after_freeze = freeze_paren = RExC_npar;
10073 case ':': /* (?:...) */
10074 case '>': /* (?>...) */
10076 case '$': /* (?$...) */
10077 case '@': /* (?@...) */
10078 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
10080 case '0' : /* (?0) */
10081 case 'R' : /* (?R) */
10082 if (*RExC_parse != ')')
10083 FAIL("Sequence (?R) not terminated");
10084 ret = reg_node(pRExC_state, GOSTART);
10085 RExC_seen |= REG_GOSTART_SEEN;
10086 *flagp |= POSTPONED;
10087 nextchar(pRExC_state);
10090 /* named and numeric backreferences */
10091 case '&': /* (?&NAME) */
10092 parse_start = RExC_parse - 1;
10095 SV *sv_dat = reg_scan_name(pRExC_state,
10096 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10097 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10099 if (RExC_parse == RExC_end || *RExC_parse != ')')
10100 vFAIL("Sequence (?&... not terminated");
10101 goto gen_recurse_regop;
10104 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10106 vFAIL("Illegal pattern");
10108 goto parse_recursion;
10110 case '-': /* (?-1) */
10111 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
10112 RExC_parse--; /* rewind to let it be handled later */
10116 case '1': case '2': case '3': case '4': /* (?1) */
10117 case '5': case '6': case '7': case '8': case '9':
10121 bool is_neg = FALSE;
10123 parse_start = RExC_parse - 1; /* MJD */
10124 if (*RExC_parse == '-') {
10128 if (grok_atoUV(RExC_parse, &unum, &endptr)
10132 RExC_parse = (char*)endptr;
10136 /* Some limit for num? */
10140 if (*RExC_parse!=')')
10141 vFAIL("Expecting close bracket");
10144 if ( paren == '-' ) {
10146 Diagram of capture buffer numbering.
10147 Top line is the normal capture buffer numbers
10148 Bottom line is the negative indexing as from
10152 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
10156 num = RExC_npar + num;
10159 vFAIL("Reference to nonexistent group");
10161 } else if ( paren == '+' ) {
10162 num = RExC_npar + num - 1;
10165 ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
10167 if (num > (I32)RExC_rx->nparens) {
10169 vFAIL("Reference to nonexistent group");
10171 RExC_recurse_count++;
10172 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10173 "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
10174 22, "| |", (int)(depth * 2 + 1), "",
10175 (UV)ARG(ret), (IV)ARG2L(ret)));
10177 RExC_seen |= REG_RECURSE_SEEN;
10178 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
10179 Set_Node_Offset(ret, parse_start); /* MJD */
10181 *flagp |= POSTPONED;
10182 nextchar(pRExC_state);
10187 case '?': /* (??...) */
10189 if (*RExC_parse != '{') {
10190 RExC_parse += SKIP_IF_CHAR(RExC_parse);
10191 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
10193 "Sequence (%"UTF8f"...) not recognized",
10194 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
10195 NOT_REACHED; /*NOTREACHED*/
10197 *flagp |= POSTPONED;
10198 paren = *RExC_parse++;
10200 case '{': /* (?{...}) */
10203 struct reg_code_block *cb;
10205 RExC_seen_zerolen++;
10207 if ( !pRExC_state->num_code_blocks
10208 || pRExC_state->code_index >= pRExC_state->num_code_blocks
10209 || pRExC_state->code_blocks[pRExC_state->code_index].start
10210 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
10213 if (RExC_pm_flags & PMf_USE_RE_EVAL)
10214 FAIL("panic: Sequence (?{...}): no code block found\n");
10215 FAIL("Eval-group not allowed at runtime, use re 'eval'");
10217 /* this is a pre-compiled code block (?{...}) */
10218 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
10219 RExC_parse = RExC_start + cb->end;
10222 if (cb->src_regex) {
10223 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
10224 RExC_rxi->data->data[n] =
10225 (void*)SvREFCNT_inc((SV*)cb->src_regex);
10226 RExC_rxi->data->data[n+1] = (void*)o;
10229 n = add_data(pRExC_state,
10230 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
10231 RExC_rxi->data->data[n] = (void*)o;
10234 pRExC_state->code_index++;
10235 nextchar(pRExC_state);
10239 ret = reg_node(pRExC_state, LOGICAL);
10241 eval = reg2Lanode(pRExC_state, EVAL,
10244 /* for later propagation into (??{})
10246 RExC_flags & RXf_PMf_COMPILETIME
10251 REGTAIL(pRExC_state, ret, eval);
10252 /* deal with the length of this later - MJD */
10255 ret = reg2Lanode(pRExC_state, EVAL, n, 0);
10256 Set_Node_Length(ret, RExC_parse - parse_start + 1);
10257 Set_Node_Offset(ret, parse_start);
10260 case '(': /* (?(?{...})...) and (?(?=...)...) */
10263 const int DEFINE_len = sizeof("DEFINE") - 1;
10264 if (RExC_parse[0] == '?') { /* (?(?...)) */
10265 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
10266 || RExC_parse[1] == '<'
10267 || RExC_parse[1] == '{') { /* Lookahead or eval. */
10271 ret = reg_node(pRExC_state, LOGICAL);
10275 tail = reg(pRExC_state, 1, &flag, depth+1);
10276 if (flag & RESTART_UTF8) {
10277 *flagp = RESTART_UTF8;
10280 REGTAIL(pRExC_state, ret, tail);
10283 /* Fall through to ‘Unknown switch condition’ at the
10284 end of the if/else chain. */
10286 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10287 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10289 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10290 char *name_start= RExC_parse++;
10292 SV *sv_dat=reg_scan_name(pRExC_state,
10293 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10294 if (RExC_parse == name_start || *RExC_parse != ch)
10295 vFAIL2("Sequence (?(%c... not terminated",
10296 (ch == '>' ? '<' : ch));
10299 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10300 RExC_rxi->data->data[num]=(void*)sv_dat;
10301 SvREFCNT_inc_simple_void(sv_dat);
10303 ret = reganode(pRExC_state,NGROUPP,num);
10304 goto insert_if_check_paren;
10306 else if (RExC_end - RExC_parse >= DEFINE_len
10307 && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
10309 ret = reganode(pRExC_state,DEFINEP,0);
10310 RExC_parse += DEFINE_len;
10312 goto insert_if_check_paren;
10314 else if (RExC_parse[0] == 'R') {
10317 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10319 if (grok_atoUV(RExC_parse, &uv, &endptr)
10323 RExC_parse = (char*)endptr;
10325 /* else "Switch condition not recognized" below */
10326 } else if (RExC_parse[0] == '&') {
10329 sv_dat = reg_scan_name(pRExC_state,
10331 ? REG_RSN_RETURN_NULL
10332 : REG_RSN_RETURN_DATA);
10333 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10335 ret = reganode(pRExC_state,INSUBP,parno);
10336 goto insert_if_check_paren;
10338 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10343 if (grok_atoUV(RExC_parse, &uv, &endptr)
10347 RExC_parse = (char*)endptr;
10349 /* XXX else what? */
10350 ret = reganode(pRExC_state, GROUPP, parno);
10352 insert_if_check_paren:
10353 if (*(tmp = nextchar(pRExC_state)) != ')') {
10354 /* nextchar also skips comments, so undo its work
10355 * and skip over the the next character.
10358 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10359 vFAIL("Switch condition not recognized");
10362 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10363 br = regbranch(pRExC_state, &flags, 1,depth+1);
10365 if (flags & RESTART_UTF8) {
10366 *flagp = RESTART_UTF8;
10369 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10372 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10374 c = *nextchar(pRExC_state);
10375 if (flags&HASWIDTH)
10376 *flagp |= HASWIDTH;
10379 vFAIL("(?(DEFINE)....) does not allow branches");
10381 /* Fake one for optimizer. */
10382 lastbr = reganode(pRExC_state, IFTHEN, 0);
10384 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10385 if (flags & RESTART_UTF8) {
10386 *flagp = RESTART_UTF8;
10389 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10392 REGTAIL(pRExC_state, ret, lastbr);
10393 if (flags&HASWIDTH)
10394 *flagp |= HASWIDTH;
10395 c = *nextchar(pRExC_state);
10400 if (RExC_parse>RExC_end)
10401 vFAIL("Switch (?(condition)... not terminated");
10403 vFAIL("Switch (?(condition)... contains too many branches");
10405 ender = reg_node(pRExC_state, TAIL);
10406 REGTAIL(pRExC_state, br, ender);
10408 REGTAIL(pRExC_state, lastbr, ender);
10409 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10412 REGTAIL(pRExC_state, ret, ender);
10413 RExC_size++; /* XXX WHY do we need this?!!
10414 For large programs it seems to be required
10415 but I can't figure out why. -- dmq*/
10418 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10419 vFAIL("Unknown switch condition (?(...))");
10421 case '[': /* (?[ ... ]) */
10422 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10425 RExC_parse--; /* for vFAIL to print correctly */
10426 vFAIL("Sequence (? incomplete");
10428 default: /* e.g., (?i) */
10431 parse_lparen_question_flags(pRExC_state);
10432 if (UCHARAT(RExC_parse) != ':') {
10434 nextchar(pRExC_state);
10439 nextchar(pRExC_state);
10444 else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
10449 ret = reganode(pRExC_state, OPEN, parno);
10451 if (!RExC_nestroot)
10452 RExC_nestroot = parno;
10453 if (RExC_seen & REG_RECURSE_SEEN
10454 && !RExC_open_parens[parno-1])
10456 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10457 "%*s%*s Setting open paren #%"IVdf" to %d\n",
10458 22, "| |", (int)(depth * 2 + 1), "",
10459 (IV)parno, REG_NODE_NUM(ret)));
10460 RExC_open_parens[parno-1]= ret;
10463 Set_Node_Length(ret, 1); /* MJD */
10464 Set_Node_Offset(ret, RExC_parse); /* MJD */
10467 /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */
10476 /* Pick up the branches, linking them together. */
10477 parse_start = RExC_parse; /* MJD */
10478 br = regbranch(pRExC_state, &flags, 1,depth+1);
10480 /* branch_len = (paren != 0); */
10483 if (flags & RESTART_UTF8) {
10484 *flagp = RESTART_UTF8;
10487 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10489 if (*RExC_parse == '|') {
10490 if (!SIZE_ONLY && RExC_extralen) {
10491 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10494 reginsert(pRExC_state, BRANCH, br, depth+1);
10495 Set_Node_Length(br, paren != 0);
10496 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10500 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10502 else if (paren == ':') {
10503 *flagp |= flags&SIMPLE;
10505 if (is_open) { /* Starts with OPEN. */
10506 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10508 else if (paren != '?') /* Not Conditional */
10510 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10512 while (*RExC_parse == '|') {
10513 if (!SIZE_ONLY && RExC_extralen) {
10514 ender = reganode(pRExC_state, LONGJMP,0);
10516 /* Append to the previous. */
10517 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10520 RExC_extralen += 2; /* Account for LONGJMP. */
10521 nextchar(pRExC_state);
10522 if (freeze_paren) {
10523 if (RExC_npar > after_freeze)
10524 after_freeze = RExC_npar;
10525 RExC_npar = freeze_paren;
10527 br = regbranch(pRExC_state, &flags, 0, depth+1);
10530 if (flags & RESTART_UTF8) {
10531 *flagp = RESTART_UTF8;
10534 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10536 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10538 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10541 if (have_branch || paren != ':') {
10542 /* Make a closing node, and hook it on the end. */
10545 ender = reg_node(pRExC_state, TAIL);
10548 ender = reganode(pRExC_state, CLOSE, parno);
10549 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10550 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10551 "%*s%*s Setting close paren #%"IVdf" to %d\n",
10552 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
10553 RExC_close_parens[parno-1]= ender;
10554 if (RExC_nestroot == parno)
10557 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10558 Set_Node_Length(ender,1); /* MJD */
10564 *flagp &= ~HASWIDTH;
10567 ender = reg_node(pRExC_state, SUCCEED);
10570 ender = reg_node(pRExC_state, END);
10572 assert(!RExC_opend); /* there can only be one! */
10573 RExC_opend = ender;
10577 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10578 DEBUG_PARSE_MSG("lsbr");
10579 regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
10580 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10581 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10582 SvPV_nolen_const(RExC_mysv1),
10583 (IV)REG_NODE_NUM(lastbr),
10584 SvPV_nolen_const(RExC_mysv2),
10585 (IV)REG_NODE_NUM(ender),
10586 (IV)(ender - lastbr)
10589 REGTAIL(pRExC_state, lastbr, ender);
10591 if (have_branch && !SIZE_ONLY) {
10592 char is_nothing= 1;
10594 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10596 /* Hook the tails of the branches to the closing node. */
10597 for (br = ret; br; br = regnext(br)) {
10598 const U8 op = PL_regkind[OP(br)];
10599 if (op == BRANCH) {
10600 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10601 if ( OP(NEXTOPER(br)) != NOTHING
10602 || regnext(NEXTOPER(br)) != ender)
10605 else if (op == BRANCHJ) {
10606 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10607 /* for now we always disable this optimisation * /
10608 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10609 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10615 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10616 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10617 DEBUG_PARSE_MSG("NADA");
10618 regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
10619 regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
10620 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10621 SvPV_nolen_const(RExC_mysv1),
10622 (IV)REG_NODE_NUM(ret),
10623 SvPV_nolen_const(RExC_mysv2),
10624 (IV)REG_NODE_NUM(ender),
10629 if (OP(ender) == TAIL) {
10634 for ( opt= br + 1; opt < ender ; opt++ )
10635 OP(opt)= OPTIMIZED;
10636 NEXT_OFF(br)= ender - br;
10644 static const char parens[] = "=!<,>";
10646 if (paren && (p = strchr(parens, paren))) {
10647 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10648 int flag = (p - parens) > 1;
10651 node = SUSPEND, flag = 0;
10652 reginsert(pRExC_state, node,ret, depth+1);
10653 Set_Node_Cur_Length(ret, parse_start);
10654 Set_Node_Offset(ret, parse_start + 1);
10656 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10660 /* Check for proper termination. */
10662 /* restore original flags, but keep (?p) */
10663 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10664 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10665 RExC_parse = oregcomp_parse;
10666 vFAIL("Unmatched (");
10669 else if (!paren && RExC_parse < RExC_end) {
10670 if (*RExC_parse == ')') {
10672 vFAIL("Unmatched )");
10675 FAIL("Junk on end of regexp"); /* "Can't happen". */
10676 NOT_REACHED; /* NOTREACHED */
10679 if (RExC_in_lookbehind) {
10680 RExC_in_lookbehind--;
10682 if (after_freeze > RExC_npar)
10683 RExC_npar = after_freeze;
10688 - regbranch - one alternative of an | operator
10690 * Implements the concatenation operator.
10692 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10696 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10699 regnode *chain = NULL;
10701 I32 flags = 0, c = 0;
10702 GET_RE_DEBUG_FLAGS_DECL;
10704 PERL_ARGS_ASSERT_REGBRANCH;
10706 DEBUG_PARSE("brnc");
10711 if (!SIZE_ONLY && RExC_extralen)
10712 ret = reganode(pRExC_state, BRANCHJ,0);
10714 ret = reg_node(pRExC_state, BRANCH);
10715 Set_Node_Length(ret, 1);
10719 if (!first && SIZE_ONLY)
10720 RExC_extralen += 1; /* BRANCHJ */
10722 *flagp = WORST; /* Tentatively. */
10725 nextchar(pRExC_state);
10726 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10727 flags &= ~TRYAGAIN;
10728 latest = regpiece(pRExC_state, &flags,depth+1);
10729 if (latest == NULL) {
10730 if (flags & TRYAGAIN)
10732 if (flags & RESTART_UTF8) {
10733 *flagp = RESTART_UTF8;
10736 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10738 else if (ret == NULL)
10740 *flagp |= flags&(HASWIDTH|POSTPONED);
10741 if (chain == NULL) /* First piece. */
10742 *flagp |= flags&SPSTART;
10744 /* FIXME adding one for every branch after the first is probably
10745 * excessive now we have TRIE support. (hv) */
10747 REGTAIL(pRExC_state, chain, latest);
10752 if (chain == NULL) { /* Loop ran zero times. */
10753 chain = reg_node(pRExC_state, NOTHING);
10758 *flagp |= flags&SIMPLE;
10765 - regpiece - something followed by possible [*+?]
10767 * Note that the branching code sequences used for ? and the general cases
10768 * of * and + are somewhat optimized: they use the same NOTHING node as
10769 * both the endmarker for their branch list and the body of the last branch.
10770 * It might seem that this node could be dispensed with entirely, but the
10771 * endmarker role is not redundant.
10773 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10775 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10779 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10785 const char * const origparse = RExC_parse;
10787 I32 max = REG_INFTY;
10788 #ifdef RE_TRACK_PATTERN_OFFSETS
10791 const char *maxpos = NULL;
10794 /* Save the original in case we change the emitted regop to a FAIL. */
10795 regnode * const orig_emit = RExC_emit;
10797 GET_RE_DEBUG_FLAGS_DECL;
10799 PERL_ARGS_ASSERT_REGPIECE;
10801 DEBUG_PARSE("piec");
10803 ret = regatom(pRExC_state, &flags,depth+1);
10805 if (flags & (TRYAGAIN|RESTART_UTF8))
10806 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10808 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10814 if (op == '{' && regcurly(RExC_parse)) {
10816 #ifdef RE_TRACK_PATTERN_OFFSETS
10817 parse_start = RExC_parse; /* MJD */
10819 next = RExC_parse + 1;
10820 while (isDIGIT(*next) || *next == ',') {
10821 if (*next == ',') {
10829 if (*next == '}') { /* got one */
10830 const char* endptr;
10834 if (isDIGIT(*RExC_parse)) {
10835 if (!grok_atoUV(RExC_parse, &uv, &endptr))
10836 vFAIL("Invalid quantifier in {,}");
10837 if (uv >= REG_INFTY)
10838 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10843 if (*maxpos == ',')
10846 maxpos = RExC_parse;
10847 if (isDIGIT(*maxpos)) {
10848 if (!grok_atoUV(maxpos, &uv, &endptr))
10849 vFAIL("Invalid quantifier in {,}");
10850 if (uv >= REG_INFTY)
10851 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10854 max = REG_INFTY; /* meaning "infinity" */
10857 nextchar(pRExC_state);
10858 if (max < min) { /* If can't match, warn and optimize to fail
10862 /* We can't back off the size because we have to reserve
10863 * enough space for all the things we are about to throw
10864 * away, but we can shrink it by the ammount we are about
10865 * to re-use here */
10866 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10869 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10870 RExC_emit = orig_emit;
10872 ret = reg_node(pRExC_state, OPFAIL);
10875 else if (min == max
10876 && RExC_parse < RExC_end
10877 && (*RExC_parse == '?' || *RExC_parse == '+'))
10880 ckWARN2reg(RExC_parse + 1,
10881 "Useless use of greediness modifier '%c'",
10884 /* Absorb the modifier, so later code doesn't see nor use
10886 nextchar(pRExC_state);
10890 if ((flags&SIMPLE)) {
10891 MARK_NAUGHTY_EXP(2, 2);
10892 reginsert(pRExC_state, CURLY, ret, depth+1);
10893 Set_Node_Offset(ret, parse_start+1); /* MJD */
10894 Set_Node_Cur_Length(ret, parse_start);
10897 regnode * const w = reg_node(pRExC_state, WHILEM);
10900 REGTAIL(pRExC_state, ret, w);
10901 if (!SIZE_ONLY && RExC_extralen) {
10902 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10903 reginsert(pRExC_state, NOTHING,ret, depth+1);
10904 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10906 reginsert(pRExC_state, CURLYX,ret, depth+1);
10908 Set_Node_Offset(ret, parse_start+1);
10909 Set_Node_Length(ret,
10910 op == '{' ? (RExC_parse - parse_start) : 1);
10912 if (!SIZE_ONLY && RExC_extralen)
10913 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10914 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10916 RExC_whilem_seen++, RExC_extralen += 3;
10917 MARK_NAUGHTY_EXP(1, 4); /* compound interest */
10924 *flagp |= HASWIDTH;
10926 ARG1_SET(ret, (U16)min);
10927 ARG2_SET(ret, (U16)max);
10929 if (max == REG_INFTY)
10930 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10936 if (!ISMULT1(op)) {
10941 #if 0 /* Now runtime fix should be reliable. */
10943 /* if this is reinstated, don't forget to put this back into perldiag:
10945 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10947 (F) The part of the regexp subject to either the * or + quantifier
10948 could match an empty string. The {#} shows in the regular
10949 expression about where the problem was discovered.
10953 if (!(flags&HASWIDTH) && op != '?')
10954 vFAIL("Regexp *+ operand could be empty");
10957 #ifdef RE_TRACK_PATTERN_OFFSETS
10958 parse_start = RExC_parse;
10960 nextchar(pRExC_state);
10962 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10964 if (op == '*' && (flags&SIMPLE)) {
10965 reginsert(pRExC_state, STAR, ret, depth+1);
10968 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10970 else if (op == '*') {
10974 else if (op == '+' && (flags&SIMPLE)) {
10975 reginsert(pRExC_state, PLUS, ret, depth+1);
10978 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10980 else if (op == '+') {
10984 else if (op == '?') {
10989 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10990 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10991 ckWARN2reg(RExC_parse,
10992 "%"UTF8f" matches null string many times",
10993 UTF8fARG(UTF, (RExC_parse >= origparse
10994 ? RExC_parse - origparse
10997 (void)ReREFCNT_inc(RExC_rx_sv);
11000 if (RExC_parse < RExC_end && *RExC_parse == '?') {
11001 nextchar(pRExC_state);
11002 reginsert(pRExC_state, MINMOD, ret, depth+1);
11003 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
11006 if (RExC_parse < RExC_end && *RExC_parse == '+') {
11008 nextchar(pRExC_state);
11009 ender = reg_node(pRExC_state, SUCCEED);
11010 REGTAIL(pRExC_state, ret, ender);
11011 reginsert(pRExC_state, SUSPEND, ret, depth+1);
11013 ender = reg_node(pRExC_state, TAIL);
11014 REGTAIL(pRExC_state, ret, ender);
11017 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
11019 vFAIL("Nested quantifiers");
11026 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
11034 /* This routine teases apart the various meanings of \N and returns
11035 * accordingly. The input parameters constrain which meaning(s) is/are valid
11036 * in the current context.
11038 * Exactly one of <node_p> and <code_point_p> must be non-NULL.
11040 * If <code_point_p> is not NULL, the context is expecting the result to be a
11041 * single code point. If this \N instance turns out to a single code point,
11042 * the function returns TRUE and sets *code_point_p to that code point.
11044 * If <node_p> is not NULL, the context is expecting the result to be one of
11045 * the things representable by a regnode. If this \N instance turns out to be
11046 * one such, the function generates the regnode, returns TRUE and sets *node_p
11047 * to point to that regnode.
11049 * If this instance of \N isn't legal in any context, this function will
11050 * generate a fatal error and not return.
11052 * On input, RExC_parse should point to the first char following the \N at the
11053 * time of the call. On successful return, RExC_parse will have been updated
11054 * to point to just after the sequence identified by this routine. Also
11055 * *flagp has been updated as needed.
11057 * When there is some problem with the current context and this \N instance,
11058 * the function returns FALSE, without advancing RExC_parse, nor setting
11059 * *node_p, nor *code_point_p, nor *flagp.
11061 * If <cp_count> is not NULL, the caller wants to know the length (in code
11062 * points) that this \N sequence matches. This is set even if the function
11063 * returns FALSE, as detailed below.
11065 * There are 5 possibilities here, as detailed in the next 5 paragraphs.
11067 * Probably the most common case is for the \N to specify a single code point.
11068 * *cp_count will be set to 1, and *code_point_p will be set to that code
11071 * Another possibility is for the input to be an empty \N{}, which for
11072 * backwards compatibility we accept. *cp_count will be set to 0. *node_p
11073 * will be set to a generated NOTHING node.
11075 * Still another possibility is for the \N to mean [^\n]. *cp_count will be
11076 * set to 0. *node_p will be set to a generated REG_ANY node.
11078 * The fourth possibility is that \N resolves to a sequence of more than one
11079 * code points. *cp_count will be set to the number of code points in the
11080 * sequence. *node_p * will be set to a generated node returned by this
11081 * function calling S_reg().
11083 * The final possibility, which happens only when the fourth one would
11084 * otherwise be in effect, is that one of those code points requires the
11085 * pattern to be recompiled as UTF-8. The function returns FALSE, and sets
11086 * the RESTART_UTF8 flag in *flagp. When this happens, the caller needs to
11087 * desist from continuing parsing, and return this information to its caller.
11088 * This is not set for when there is only one code point, as this can be
11089 * called as part of an ANYOF node, and they can store above-Latin1 code
11090 * points without the pattern having to be in UTF-8.
11092 * For non-single-quoted regexes, the tokenizer has resolved character and
11093 * sequence names inside \N{...} into their Unicode values, normalizing the
11094 * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
11095 * hex-represented code points in the sequence. This is done there because
11096 * the names can vary based on what charnames pragma is in scope at the time,
11097 * so we need a way to take a snapshot of what they resolve to at the time of
11098 * the original parse. [perl #56444].
11100 * That parsing is skipped for single-quoted regexes, so we may here get
11101 * '\N{NAME}'. This is a fatal error. These names have to be resolved by the
11102 * parser. But if the single-quoted regex is something like '\N{U+41}', that
11103 * is legal and handled here. The code point is Unicode, and has to be
11104 * translated into the native character set for non-ASCII platforms.
11105 * the tokenizer passes the \N sequence through unchanged; this code will not
11106 * attempt to determine this nor expand those, instead raising a syntax error.
11109 char * endbrace; /* points to '}' following the name */
11110 char *endchar; /* Points to '.' or '}' ending cur char in the input
11112 char* p; /* Temporary */
11114 GET_RE_DEBUG_FLAGS_DECL;
11116 PERL_ARGS_ASSERT_GROK_BSLASH_N;
11118 GET_RE_DEBUG_FLAGS;
11120 assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
11121 assert(! (node_p && cp_count)); /* At most 1 should be set */
11123 if (cp_count) { /* Initialize return for the most common case */
11127 /* The [^\n] meaning of \N ignores spaces and comments under the /x
11128 * modifier. The other meanings do not, so use a temporary until we find
11129 * out which we are being called with */
11130 p = (RExC_flags & RXf_PMf_EXTENDED)
11131 ? regpatws(pRExC_state, RExC_parse,
11132 TRUE) /* means recognize comments */
11135 /* Disambiguate between \N meaning a named character versus \N meaning
11136 * [^\n]. The latter is assumed when the {...} following the \N is a legal
11137 * quantifier, or there is no a '{' at all */
11138 if (*p != '{' || regcurly(p)) {
11147 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
11149 nextchar(pRExC_state);
11150 *node_p = reg_node(pRExC_state, REG_ANY);
11151 *flagp |= HASWIDTH|SIMPLE;
11153 Set_Node_Length(*node_p, 1); /* MJD */
11157 /* Here, we have decided it should be a named character or sequence */
11159 /* The test above made sure that the next real character is a '{', but
11160 * under the /x modifier, it could be separated by space (or a comment and
11161 * \n) and this is not allowed (for consistency with \x{...} and the
11162 * tokenizer handling of \N{NAME}). */
11163 if (*RExC_parse != '{') {
11164 vFAIL("Missing braces on \\N{}");
11167 RExC_parse++; /* Skip past the '{' */
11169 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
11170 || ! (endbrace == RExC_parse /* nothing between the {} */
11171 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
11172 && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
11175 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
11176 vFAIL("\\N{NAME} must be resolved by the lexer");
11179 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
11181 if (endbrace == RExC_parse) { /* empty: \N{} */
11185 nextchar(pRExC_state);
11190 *node_p = reg_node(pRExC_state,NOTHING);
11194 RExC_parse += 2; /* Skip past the 'U+' */
11196 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11198 /* Code points are separated by dots. If none, there is only one code
11199 * point, and is terminated by the brace */
11201 if (endchar >= endbrace) {
11202 STRLEN length_of_hex;
11203 I32 grok_hex_flags;
11205 /* Here, exactly one code point. If that isn't what is wanted, fail */
11206 if (! code_point_p) {
11211 /* Convert code point from hex */
11212 length_of_hex = (STRLEN)(endchar - RExC_parse);
11213 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
11214 | PERL_SCAN_DISALLOW_PREFIX
11216 /* No errors in the first pass (See [perl
11217 * #122671].) We let the code below find the
11218 * errors when there are multiple chars. */
11220 ? PERL_SCAN_SILENT_ILLDIGIT
11223 /* This routine is the one place where both single- and double-quotish
11224 * \N{U+xxxx} are evaluated. The value is a Unicode code point which
11225 * must be converted to native. */
11226 *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
11231 /* The tokenizer should have guaranteed validity, but it's possible to
11232 * bypass it by using single quoting, so check. Don't do the check
11233 * here when there are multiple chars; we do it below anyway. */
11234 if (length_of_hex == 0
11235 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
11237 RExC_parse += length_of_hex; /* Includes all the valid */
11238 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
11239 ? UTF8SKIP(RExC_parse)
11241 /* Guard against malformed utf8 */
11242 if (RExC_parse >= endchar) {
11243 RExC_parse = endchar;
11245 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11248 RExC_parse = endbrace + 1;
11251 else { /* Is a multiple character sequence */
11252 SV * substitute_parse;
11254 char *orig_end = RExC_end;
11257 /* Count the code points, if desired, in the sequence */
11260 while (RExC_parse < endbrace) {
11261 /* Point to the beginning of the next character in the sequence. */
11262 RExC_parse = endchar + 1;
11263 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11268 /* Fail if caller doesn't want to handle a multi-code-point sequence.
11269 * But don't backup up the pointer if the caller want to know how many
11270 * code points there are (they can then handle things) */
11278 /* What is done here is to convert this to a sub-pattern of the form
11279 * \x{char1}\x{char2}... and then call reg recursively to parse it
11280 * (enclosing in "(?: ... )" ). That way, it retains its atomicness,
11281 * while not having to worry about special handling that some code
11282 * points may have. */
11284 substitute_parse = newSVpvs("?:");
11286 while (RExC_parse < endbrace) {
11288 /* Convert to notation the rest of the code understands */
11289 sv_catpv(substitute_parse, "\\x{");
11290 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
11291 sv_catpv(substitute_parse, "}");
11293 /* Point to the beginning of the next character in the sequence. */
11294 RExC_parse = endchar + 1;
11295 endchar = RExC_parse + strcspn(RExC_parse, ".}");
11298 sv_catpv(substitute_parse, ")");
11300 RExC_parse = SvPV(substitute_parse, len);
11302 /* Don't allow empty number */
11303 if (len < (STRLEN) 8) {
11304 RExC_parse = endbrace;
11305 vFAIL("Invalid hexadecimal number in \\N{U+...}");
11307 RExC_end = RExC_parse + len;
11309 /* The values are Unicode, and therefore not subject to recoding, but
11310 * have to be converted to native on a non-Unicode (meaning non-ASCII)
11312 RExC_override_recoding = 1;
11314 RExC_recode_x_to_native = 1;
11318 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
11319 if (flags & RESTART_UTF8) {
11320 *flagp = RESTART_UTF8;
11323 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11326 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11329 /* Restore the saved values */
11330 RExC_parse = endbrace;
11331 RExC_end = orig_end;
11332 RExC_override_recoding = 0;
11334 RExC_recode_x_to_native = 0;
11337 SvREFCNT_dec_NN(substitute_parse);
11338 nextchar(pRExC_state);
11348 * It returns the code point in utf8 for the value in *encp.
11349 * value: a code value in the source encoding
11350 * encp: a pointer to an Encode object
11352 * If the result from Encode is not a single character,
11353 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11356 S_reg_recode(pTHX_ const char value, SV **encp)
11359 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11360 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11361 const STRLEN newlen = SvCUR(sv);
11362 UV uv = UNICODE_REPLACEMENT;
11364 PERL_ARGS_ASSERT_REG_RECODE;
11368 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11371 if (!newlen || numlen != newlen) {
11372 uv = UNICODE_REPLACEMENT;
11378 PERL_STATIC_INLINE U8
11379 S_compute_EXACTish(RExC_state_t *pRExC_state)
11383 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11391 op = get_regex_charset(RExC_flags);
11392 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11393 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11394 been, so there is no hole */
11397 return op + EXACTF;
11400 PERL_STATIC_INLINE void
11401 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11402 regnode *node, I32* flagp, STRLEN len, UV code_point,
11405 /* This knows the details about sizing an EXACTish node, setting flags for
11406 * it (by setting <*flagp>, and potentially populating it with a single
11409 * If <len> (the length in bytes) is non-zero, this function assumes that
11410 * the node has already been populated, and just does the sizing. In this
11411 * case <code_point> should be the final code point that has already been
11412 * placed into the node. This value will be ignored except that under some
11413 * circumstances <*flagp> is set based on it.
11415 * If <len> is zero, the function assumes that the node is to contain only
11416 * the single character given by <code_point> and calculates what <len>
11417 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11418 * additionally will populate the node's STRING with <code_point> or its
11421 * In both cases <*flagp> is appropriately set
11423 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11424 * 255, must be folded (the former only when the rules indicate it can
11427 * When it does the populating, it looks at the flag 'downgradable'. If
11428 * true with a node that folds, it checks if the single code point
11429 * participates in a fold, and if not downgrades the node to an EXACT.
11430 * This helps the optimizer */
11432 bool len_passed_in = cBOOL(len != 0);
11433 U8 character[UTF8_MAXBYTES_CASE+1];
11435 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11437 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11438 * sizing difference, and is extra work that is thrown away */
11439 if (downgradable && ! PASS2) {
11440 downgradable = FALSE;
11443 if (! len_passed_in) {
11445 if (UVCHR_IS_INVARIANT(code_point)) {
11446 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11447 *character = (U8) code_point;
11449 else { /* Here is /i and not /l. (toFOLD() is defined on just
11450 ASCII, which isn't the same thing as INVARIANT on
11451 EBCDIC, but it works there, as the extra invariants
11452 fold to themselves) */
11453 *character = toFOLD((U8) code_point);
11455 /* We can downgrade to an EXACT node if this character
11456 * isn't a folding one. Note that this assumes that
11457 * nothing above Latin1 folds to some other invariant than
11458 * one of these alphabetics; otherwise we would also have
11460 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11461 * || ASCII_FOLD_RESTRICTED))
11463 if (downgradable && PL_fold[code_point] == code_point) {
11469 else if (FOLD && (! LOC
11470 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11471 { /* Folding, and ok to do so now */
11472 UV folded = _to_uni_fold_flags(
11476 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11477 ? FOLD_FLAGS_NOMIX_ASCII
11480 && folded == code_point /* This quickly rules out many
11481 cases, avoiding the
11482 _invlist_contains_cp() overhead
11484 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11491 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11493 /* Not folding this cp, and can output it directly */
11494 *character = UTF8_TWO_BYTE_HI(code_point);
11495 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11499 uvchr_to_utf8( character, code_point);
11500 len = UTF8SKIP(character);
11502 } /* Else pattern isn't UTF8. */
11504 *character = (U8) code_point;
11506 } /* Else is folded non-UTF8 */
11507 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11509 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11510 * comments at join_exact()); */
11511 *character = (U8) code_point;
11514 /* Can turn into an EXACT node if we know the fold at compile time,
11515 * and it folds to itself and doesn't particpate in other folds */
11518 && PL_fold_latin1[code_point] == code_point
11519 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11520 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11524 } /* else is Sharp s. May need to fold it */
11525 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11527 *(character + 1) = 's';
11531 *character = LATIN_SMALL_LETTER_SHARP_S;
11537 RExC_size += STR_SZ(len);
11540 RExC_emit += STR_SZ(len);
11541 STR_LEN(node) = len;
11542 if (! len_passed_in) {
11543 Copy((char *) character, STRING(node), len, char);
11547 *flagp |= HASWIDTH;
11549 /* A single character node is SIMPLE, except for the special-cased SHARP S
11551 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11552 && (code_point != LATIN_SMALL_LETTER_SHARP_S
11553 || ! FOLD || ! DEPENDS_SEMANTICS))
11558 /* The OP may not be well defined in PASS1 */
11559 if (PASS2 && OP(node) == EXACTFL) {
11560 RExC_contains_locale = 1;
11565 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11566 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11569 S_backref_value(char *p)
11571 const char* endptr;
11573 if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX)
11580 - regatom - the lowest level
11582 Try to identify anything special at the start of the pattern. If there
11583 is, then handle it as required. This may involve generating a single regop,
11584 such as for an assertion; or it may involve recursing, such as to
11585 handle a () structure.
11587 If the string doesn't start with something special then we gobble up
11588 as much literal text as we can.
11590 Once we have been able to handle whatever type of thing started the
11591 sequence, we return.
11593 Note: we have to be careful with escapes, as they can be both literal
11594 and special, and in the case of \10 and friends, context determines which.
11596 A summary of the code structure is:
11598 switch (first_byte) {
11599 cases for each special:
11600 handle this special;
11603 switch (2nd byte) {
11604 cases for each unambiguous special:
11605 handle this special;
11607 cases for each ambigous special/literal:
11609 if (special) handle here
11611 default: // unambiguously literal:
11614 default: // is a literal char
11617 create EXACTish node for literal;
11618 while (more input and node isn't full) {
11619 switch (input_byte) {
11620 cases for each special;
11621 make sure parse pointer is set so that the next call to
11622 regatom will see this special first
11623 goto loopdone; // EXACTish node terminated by prev. char
11625 append char to EXACTISH node;
11627 get next input byte;
11631 return the generated node;
11633 Specifically there are two separate switches for handling
11634 escape sequences, with the one for handling literal escapes requiring
11635 a dummy entry for all of the special escapes that are actually handled
11638 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11640 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11642 Otherwise does not return NULL.
11646 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11648 regnode *ret = NULL;
11650 char *parse_start = RExC_parse;
11655 GET_RE_DEBUG_FLAGS_DECL;
11657 *flagp = WORST; /* Tentatively. */
11659 DEBUG_PARSE("atom");
11661 PERL_ARGS_ASSERT_REGATOM;
11664 switch ((U8)*RExC_parse) {
11666 RExC_seen_zerolen++;
11667 nextchar(pRExC_state);
11668 if (RExC_flags & RXf_PMf_MULTILINE)
11669 ret = reg_node(pRExC_state, MBOL);
11671 ret = reg_node(pRExC_state, SBOL);
11672 Set_Node_Length(ret, 1); /* MJD */
11675 nextchar(pRExC_state);
11677 RExC_seen_zerolen++;
11678 if (RExC_flags & RXf_PMf_MULTILINE)
11679 ret = reg_node(pRExC_state, MEOL);
11681 ret = reg_node(pRExC_state, SEOL);
11682 Set_Node_Length(ret, 1); /* MJD */
11685 nextchar(pRExC_state);
11686 if (RExC_flags & RXf_PMf_SINGLELINE)
11687 ret = reg_node(pRExC_state, SANY);
11689 ret = reg_node(pRExC_state, REG_ANY);
11690 *flagp |= HASWIDTH|SIMPLE;
11692 Set_Node_Length(ret, 1); /* MJD */
11696 char * const oregcomp_parse = ++RExC_parse;
11697 ret = regclass(pRExC_state, flagp,depth+1,
11698 FALSE, /* means parse the whole char class */
11699 TRUE, /* allow multi-char folds */
11700 FALSE, /* don't silence non-portable warnings. */
11701 (bool) RExC_strict,
11703 if (*RExC_parse != ']') {
11704 RExC_parse = oregcomp_parse;
11705 vFAIL("Unmatched [");
11708 if (*flagp & RESTART_UTF8)
11710 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11713 nextchar(pRExC_state);
11714 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11718 nextchar(pRExC_state);
11719 ret = reg(pRExC_state, 2, &flags,depth+1);
11721 if (flags & TRYAGAIN) {
11722 if (RExC_parse == RExC_end) {
11723 /* Make parent create an empty node if needed. */
11724 *flagp |= TRYAGAIN;
11729 if (flags & RESTART_UTF8) {
11730 *flagp = RESTART_UTF8;
11733 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11736 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11740 if (flags & TRYAGAIN) {
11741 *flagp |= TRYAGAIN;
11744 vFAIL("Internal urp");
11745 /* Supposed to be caught earlier. */
11751 vFAIL("Quantifier follows nothing");
11756 This switch handles escape sequences that resolve to some kind
11757 of special regop and not to literal text. Escape sequnces that
11758 resolve to literal text are handled below in the switch marked
11761 Every entry in this switch *must* have a corresponding entry
11762 in the literal escape switch. However, the opposite is not
11763 required, as the default for this switch is to jump to the
11764 literal text handling code.
11766 switch ((U8)*++RExC_parse) {
11767 /* Special Escapes */
11769 RExC_seen_zerolen++;
11770 ret = reg_node(pRExC_state, SBOL);
11771 /* SBOL is shared with /^/ so we set the flags so we can tell
11772 * /\A/ from /^/ in split. We check ret because first pass we
11773 * have no regop struct to set the flags on. */
11777 goto finish_meta_pat;
11779 ret = reg_node(pRExC_state, GPOS);
11780 RExC_seen |= REG_GPOS_SEEN;
11782 goto finish_meta_pat;
11784 RExC_seen_zerolen++;
11785 ret = reg_node(pRExC_state, KEEPS);
11787 /* XXX:dmq : disabling in-place substitution seems to
11788 * be necessary here to avoid cases of memory corruption, as
11789 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11791 RExC_seen |= REG_LOOKBEHIND_SEEN;
11792 goto finish_meta_pat;
11794 ret = reg_node(pRExC_state, SEOL);
11796 RExC_seen_zerolen++; /* Do not optimize RE away */
11797 goto finish_meta_pat;
11799 ret = reg_node(pRExC_state, EOS);
11801 RExC_seen_zerolen++; /* Do not optimize RE away */
11802 goto finish_meta_pat;
11804 vFAIL("\\C no longer supported");
11806 ret = reg_node(pRExC_state, CLUMP);
11807 *flagp |= HASWIDTH;
11808 goto finish_meta_pat;
11814 arg = ANYOF_WORDCHAR;
11822 regex_charset charset = get_regex_charset(RExC_flags);
11824 RExC_seen_zerolen++;
11825 RExC_seen |= REG_LOOKBEHIND_SEEN;
11826 op = BOUND + charset;
11828 if (op == BOUNDL) {
11829 RExC_contains_locale = 1;
11832 ret = reg_node(pRExC_state, op);
11834 if (*(RExC_parse + 1) != '{') {
11835 FLAGS(ret) = TRADITIONAL_BOUND;
11836 if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
11842 char name = *RExC_parse;
11845 endbrace = strchr(RExC_parse, '}');
11848 vFAIL2("Missing right brace on \\%c{}", name);
11850 /* XXX Need to decide whether to take spaces or not. Should be
11851 * consistent with \p{}, but that currently is SPACE, which
11852 * means vertical too, which seems wrong
11853 * while (isBLANK(*RExC_parse)) {
11856 if (endbrace == RExC_parse) {
11857 RExC_parse++; /* After the '}' */
11858 vFAIL2("Empty \\%c{}", name);
11860 length = endbrace - RExC_parse;
11861 /*while (isBLANK(*(RExC_parse + length - 1))) {
11864 switch (*RExC_parse) {
11867 && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
11869 goto bad_bound_type;
11871 FLAGS(ret) = GCB_BOUND;
11874 if (length != 2 || *(RExC_parse + 1) != 'b') {
11875 goto bad_bound_type;
11877 FLAGS(ret) = SB_BOUND;
11880 if (length != 2 || *(RExC_parse + 1) != 'b') {
11881 goto bad_bound_type;
11883 FLAGS(ret) = WB_BOUND;
11887 RExC_parse = endbrace;
11889 "'%"UTF8f"' is an unknown bound type",
11890 UTF8fARG(UTF, length, endbrace - length));
11891 NOT_REACHED; /*NOTREACHED*/
11893 RExC_parse = endbrace;
11894 RExC_uni_semantics = 1;
11896 if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
11900 /* Don't have to worry about UTF-8, in this message because
11901 * to get here the contents of the \b must be ASCII */
11902 ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */
11903 "Using /u for '%.*s' instead of /%s",
11905 endbrace - length + 1,
11906 (charset == REGEX_ASCII_RESTRICTED_CHARSET)
11907 ? ASCII_RESTRICT_PAT_MODS
11908 : ASCII_MORE_RESTRICT_PAT_MODS);
11912 if (PASS2 && invert) {
11913 OP(ret) += NBOUND - BOUND;
11915 goto finish_meta_pat;
11923 if (! DEPENDS_SEMANTICS) {
11927 /* \d doesn't have any matches in the upper Latin1 range, hence /d
11928 * is equivalent to /u. Changing to /u saves some branches at
11931 goto join_posix_op_known;
11934 ret = reg_node(pRExC_state, LNBREAK);
11935 *flagp |= HASWIDTH|SIMPLE;
11936 goto finish_meta_pat;
11944 goto join_posix_op_known;
11950 arg = ANYOF_VERTWS;
11952 goto join_posix_op_known;
11962 op = POSIXD + get_regex_charset(RExC_flags);
11963 if (op > POSIXA) { /* /aa is same as /a */
11966 else if (op == POSIXL) {
11967 RExC_contains_locale = 1;
11970 join_posix_op_known:
11973 op += NPOSIXD - POSIXD;
11976 ret = reg_node(pRExC_state, op);
11978 FLAGS(ret) = namedclass_to_classnum(arg);
11981 *flagp |= HASWIDTH|SIMPLE;
11985 nextchar(pRExC_state);
11986 Set_Node_Length(ret, 2); /* MJD */
11992 char* parse_start = RExC_parse - 2;
11997 ret = regclass(pRExC_state, flagp,depth+1,
11998 TRUE, /* means just parse this element */
11999 FALSE, /* don't allow multi-char folds */
12000 FALSE, /* don't silence non-portable warnings.
12001 It would be a bug if these returned
12003 (bool) RExC_strict,
12005 /* regclass() can only return RESTART_UTF8 if multi-char folds
12008 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
12013 Set_Node_Offset(ret, parse_start + 2);
12014 Set_Node_Cur_Length(ret, parse_start);
12015 nextchar(pRExC_state);
12019 /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
12020 * \N{...} evaluates to a sequence of more than one code points).
12021 * The function call below returns a regnode, which is our result.
12022 * The parameters cause it to fail if the \N{} evaluates to a
12023 * single code point; we handle those like any other literal. The
12024 * reason that the multicharacter case is handled here and not as
12025 * part of the EXACtish code is because of quantifiers. In
12026 * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
12027 * this way makes that Just Happen. dmq.
12028 * join_exact() will join this up with adjacent EXACTish nodes
12029 * later on, if appropriate. */
12031 if (grok_bslash_N(pRExC_state,
12032 &ret, /* Want a regnode returned */
12033 NULL, /* Fail if evaluates to a single code
12035 NULL, /* Don't need a count of how many code
12043 if (*flagp & RESTART_UTF8)
12048 case 'k': /* Handle \k<NAME> and \k'NAME' */
12051 char ch= RExC_parse[1];
12052 if (ch != '<' && ch != '\'' && ch != '{') {
12054 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12055 vFAIL2("Sequence %.2s... not terminated",parse_start);
12057 /* this pretty much dupes the code for (?P=...) in reg(), if
12058 you change this make sure you change that */
12059 char* name_start = (RExC_parse += 2);
12061 SV *sv_dat = reg_scan_name(pRExC_state,
12062 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
12063 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
12064 if (RExC_parse == name_start || *RExC_parse != ch)
12065 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
12066 vFAIL2("Sequence %.3s... not terminated",parse_start);
12069 num = add_data( pRExC_state, STR_WITH_LEN("S"));
12070 RExC_rxi->data->data[num]=(void*)sv_dat;
12071 SvREFCNT_inc_simple_void(sv_dat);
12075 ret = reganode(pRExC_state,
12078 : (ASCII_FOLD_RESTRICTED)
12080 : (AT_LEAST_UNI_SEMANTICS)
12086 *flagp |= HASWIDTH;
12088 /* override incorrect value set in reganode MJD */
12089 Set_Node_Offset(ret, parse_start+1);
12090 Set_Node_Cur_Length(ret, parse_start);
12091 nextchar(pRExC_state);
12097 case '1': case '2': case '3': case '4':
12098 case '5': case '6': case '7': case '8': case '9':
12103 if (*RExC_parse == 'g') {
12107 if (*RExC_parse == '{') {
12111 if (*RExC_parse == '-') {
12115 if (hasbrace && !isDIGIT(*RExC_parse)) {
12116 if (isrel) RExC_parse--;
12118 goto parse_named_seq;
12121 num = S_backref_value(RExC_parse);
12123 vFAIL("Reference to invalid group 0");
12124 else if (num == I32_MAX) {
12125 if (isDIGIT(*RExC_parse))
12126 vFAIL("Reference to nonexistent group");
12128 vFAIL("Unterminated \\g... pattern");
12132 num = RExC_npar - num;
12134 vFAIL("Reference to nonexistent or unclosed group");
12138 num = S_backref_value(RExC_parse);
12139 /* bare \NNN might be backref or octal - if it is larger
12140 * than or equal RExC_npar then it is assumed to be an
12141 * octal escape. Note RExC_npar is +1 from the actual
12142 * number of parens. */
12143 /* Note we do NOT check if num == I32_MAX here, as that is
12144 * handled by the RExC_npar check */
12147 /* any numeric escape < 10 is always a backref */
12149 /* any numeric escape < RExC_npar is a backref */
12150 && num >= RExC_npar
12151 /* cannot be an octal escape if it starts with 8 */
12152 && *RExC_parse != '8'
12153 /* cannot be an octal escape it it starts with 9 */
12154 && *RExC_parse != '9'
12157 /* Probably not a backref, instead likely to be an
12158 * octal character escape, e.g. \35 or \777.
12159 * The above logic should make it obvious why using
12160 * octal escapes in patterns is problematic. - Yves */
12165 /* At this point RExC_parse points at a numeric escape like
12166 * \12 or \88 or something similar, which we should NOT treat
12167 * as an octal escape. It may or may not be a valid backref
12168 * escape. For instance \88888888 is unlikely to be a valid
12171 #ifdef RE_TRACK_PATTERN_OFFSETS
12172 char * const parse_start = RExC_parse - 1; /* MJD */
12174 while (isDIGIT(*RExC_parse))
12177 if (*RExC_parse != '}')
12178 vFAIL("Unterminated \\g{...} pattern");
12182 if (num > (I32)RExC_rx->nparens)
12183 vFAIL("Reference to nonexistent group");
12186 ret = reganode(pRExC_state,
12189 : (ASCII_FOLD_RESTRICTED)
12191 : (AT_LEAST_UNI_SEMANTICS)
12197 *flagp |= HASWIDTH;
12199 /* override incorrect value set in reganode MJD */
12200 Set_Node_Offset(ret, parse_start+1);
12201 Set_Node_Cur_Length(ret, parse_start);
12203 nextchar(pRExC_state);
12208 if (RExC_parse >= RExC_end)
12209 FAIL("Trailing \\");
12212 /* Do not generate "unrecognized" warnings here, we fall
12213 back into the quick-grab loop below */
12220 if (RExC_flags & RXf_PMf_EXTENDED) {
12221 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
12222 if (RExC_parse < RExC_end)
12229 parse_start = RExC_parse - 1;
12238 #define MAX_NODE_STRING_SIZE 127
12239 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
12241 U8 upper_parse = MAX_NODE_STRING_SIZE;
12242 U8 node_type = compute_EXACTish(pRExC_state);
12243 bool next_is_quantifier;
12244 char * oldp = NULL;
12246 /* We can convert EXACTF nodes to EXACTFU if they contain only
12247 * characters that match identically regardless of the target
12248 * string's UTF8ness. The reason to do this is that EXACTF is not
12249 * trie-able, EXACTFU is.
12251 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
12252 * contain only above-Latin1 characters (hence must be in UTF8),
12253 * which don't participate in folds with Latin1-range characters,
12254 * as the latter's folds aren't known until runtime. (We don't
12255 * need to figure this out until pass 2) */
12256 bool maybe_exactfu = PASS2
12257 && (node_type == EXACTF || node_type == EXACTFL);
12259 /* If a folding node contains only code points that don't
12260 * participate in folds, it can be changed into an EXACT node,
12261 * which allows the optimizer more things to look for */
12264 ret = reg_node(pRExC_state, node_type);
12266 /* In pass1, folded, we use a temporary buffer instead of the
12267 * actual node, as the node doesn't exist yet */
12268 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
12274 /* We do the EXACTFish to EXACT node only if folding. (And we
12275 * don't need to figure this out until pass 2) */
12276 maybe_exact = FOLD && PASS2;
12278 /* XXX The node can hold up to 255 bytes, yet this only goes to
12279 * 127. I (khw) do not know why. Keeping it somewhat less than
12280 * 255 allows us to not have to worry about overflow due to
12281 * converting to utf8 and fold expansion, but that value is
12282 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
12283 * split up by this limit into a single one using the real max of
12284 * 255. Even at 127, this breaks under rare circumstances. If
12285 * folding, we do not want to split a node at a character that is a
12286 * non-final in a multi-char fold, as an input string could just
12287 * happen to want to match across the node boundary. The join
12288 * would solve that problem if the join actually happens. But a
12289 * series of more than two nodes in a row each of 127 would cause
12290 * the first join to succeed to get to 254, but then there wouldn't
12291 * be room for the next one, which could at be one of those split
12292 * multi-char folds. I don't know of any fool-proof solution. One
12293 * could back off to end with only a code point that isn't such a
12294 * non-final, but it is possible for there not to be any in the
12296 for (p = RExC_parse - 1;
12297 len < upper_parse && p < RExC_end;
12302 if (RExC_flags & RXf_PMf_EXTENDED)
12303 p = regpatws(pRExC_state, p,
12304 TRUE); /* means recognize comments */
12315 /* Literal Escapes Switch
12317 This switch is meant to handle escape sequences that
12318 resolve to a literal character.
12320 Every escape sequence that represents something
12321 else, like an assertion or a char class, is handled
12322 in the switch marked 'Special Escapes' above in this
12323 routine, but also has an entry here as anything that
12324 isn't explicitly mentioned here will be treated as
12325 an unescaped equivalent literal.
12328 switch ((U8)*++p) {
12329 /* These are all the special escapes. */
12330 case 'A': /* Start assertion */
12331 case 'b': case 'B': /* Word-boundary assertion*/
12332 case 'C': /* Single char !DANGEROUS! */
12333 case 'd': case 'D': /* digit class */
12334 case 'g': case 'G': /* generic-backref, pos assertion */
12335 case 'h': case 'H': /* HORIZWS */
12336 case 'k': case 'K': /* named backref, keep marker */
12337 case 'p': case 'P': /* Unicode property */
12338 case 'R': /* LNBREAK */
12339 case 's': case 'S': /* space class */
12340 case 'v': case 'V': /* VERTWS */
12341 case 'w': case 'W': /* word class */
12342 case 'X': /* eXtended Unicode "combining
12343 character sequence" */
12344 case 'z': case 'Z': /* End of line/string assertion */
12348 /* Anything after here is an escape that resolves to a
12349 literal. (Except digits, which may or may not)
12355 case 'N': /* Handle a single-code point named character. */
12356 RExC_parse = p + 1;
12357 if (! grok_bslash_N(pRExC_state,
12358 NULL, /* Fail if evaluates to
12359 anything other than a
12360 single code point */
12361 &ender, /* The returned single code
12363 NULL, /* Don't need a count of
12364 how many code points */
12368 if (*flagp & RESTART_UTF8)
12369 FAIL("panic: grok_bslash_N set RESTART_UTF8");
12371 /* Here, it wasn't a single code point. Go close
12372 * up this EXACTish node. The switch() prior to
12373 * this switch handles the other cases */
12374 RExC_parse = p = oldp;
12378 if (ender > 0xff) {
12395 ender = ESC_NATIVE;
12405 const char* error_msg;
12407 bool valid = grok_bslash_o(&p,
12410 PASS2, /* out warnings */
12411 (bool) RExC_strict,
12412 TRUE, /* Output warnings
12417 RExC_parse = p; /* going to die anyway; point
12418 to exact spot of failure */
12422 if (IN_ENCODING && ender < 0x100) {
12423 goto recode_encoding;
12425 if (ender > 0xff) {
12432 UV result = UV_MAX; /* initialize to erroneous
12434 const char* error_msg;
12436 bool valid = grok_bslash_x(&p,
12439 PASS2, /* out warnings */
12440 (bool) RExC_strict,
12441 TRUE, /* Silence warnings
12446 RExC_parse = p; /* going to die anyway; point
12447 to exact spot of failure */
12452 if (ender < 0x100) {
12454 if (RExC_recode_x_to_native) {
12455 ender = LATIN1_TO_NATIVE(ender);
12460 goto recode_encoding;
12470 ender = grok_bslash_c(*p++, PASS2);
12472 case '8': case '9': /* must be a backreference */
12474 /* we have an escape like \8 which cannot be an octal escape
12475 * so we exit the loop, and let the outer loop handle this
12476 * escape which may or may not be a legitimate backref. */
12478 case '1': case '2': case '3':case '4':
12479 case '5': case '6': case '7':
12480 /* When we parse backslash escapes there is ambiguity
12481 * between backreferences and octal escapes. Any escape
12482 * from \1 - \9 is a backreference, any multi-digit
12483 * escape which does not start with 0 and which when
12484 * evaluated as decimal could refer to an already
12485 * parsed capture buffer is a back reference. Anything
12488 * Note this implies that \118 could be interpreted as
12489 * 118 OR as "\11" . "8" depending on whether there
12490 * were 118 capture buffers defined already in the
12493 /* NOTE, RExC_npar is 1 more than the actual number of
12494 * parens we have seen so far, hence the < RExC_npar below. */
12496 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12497 { /* Not to be treated as an octal constant, go
12505 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12507 ender = grok_oct(p, &numlen, &flags, NULL);
12508 if (ender > 0xff) {
12512 if (PASS2 /* like \08, \178 */
12515 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12517 reg_warn_non_literal_string(
12519 form_short_octal_warning(p, numlen));
12522 if (IN_ENCODING && ender < 0x100)
12523 goto recode_encoding;
12526 if (! RExC_override_recoding) {
12527 SV* enc = _get_encoding();
12528 ender = reg_recode((const char)(U8)ender, &enc);
12530 ckWARNreg(p, "Invalid escape in the specified encoding");
12536 FAIL("Trailing \\");
12539 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12540 /* Include any { following the alpha to emphasize
12541 * that it could be part of an escape at some point
12543 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12544 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12546 goto normal_default;
12547 } /* End of switch on '\' */
12550 /* Currently we don't warn when the lbrace is at the start
12551 * of a construct. This catches it in the middle of a
12552 * literal string, or when its the first thing after
12553 * something like "\b" */
12555 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12557 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12560 default: /* A literal character */
12562 if (UTF8_IS_START(*p) && UTF) {
12564 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12565 &numlen, UTF8_ALLOW_DEFAULT);
12571 } /* End of switch on the literal */
12573 /* Here, have looked at the literal character and <ender>
12574 * contains its ordinal, <p> points to the character after it
12577 if ( RExC_flags & RXf_PMf_EXTENDED)
12578 p = regpatws(pRExC_state, p,
12579 TRUE); /* means recognize comments */
12581 /* If the next thing is a quantifier, it applies to this
12582 * character only, which means that this character has to be in
12583 * its own node and can't just be appended to the string in an
12584 * existing node, so if there are already other characters in
12585 * the node, close the node with just them, and set up to do
12586 * this character again next time through, when it will be the
12587 * only thing in its new node */
12588 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12594 if (! FOLD) { /* The simple case, just append the literal */
12596 /* In the sizing pass, we need only the size of the
12597 * character we are appending, hence we can delay getting
12598 * its representation until PASS2. */
12601 const STRLEN unilen = UNISKIP(ender);
12604 /* We have to subtract 1 just below (and again in
12605 * the corresponding PASS2 code) because the loop
12606 * increments <len> each time, as all but this path
12607 * (and one other) through it add a single byte to
12608 * the EXACTish node. But these paths would change
12609 * len to be the correct final value, so cancel out
12610 * the increment that follows */
12616 } else { /* PASS2 */
12619 U8 * new_s = uvchr_to_utf8((U8*)s, ender);
12620 len += (char *) new_s - s - 1;
12621 s = (char *) new_s;
12624 *(s++) = (char) ender;
12628 else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
12630 /* Here are folding under /l, and the code point is
12631 * problematic. First, we know we can't simplify things */
12632 maybe_exact = FALSE;
12633 maybe_exactfu = FALSE;
12635 /* A problematic code point in this context means that its
12636 * fold isn't known until runtime, so we can't fold it now.
12637 * (The non-problematic code points are the above-Latin1
12638 * ones that fold to also all above-Latin1. Their folds
12639 * don't vary no matter what the locale is.) But here we
12640 * have characters whose fold depends on the locale.
12641 * Unlike the non-folding case above, we have to keep track
12642 * of these in the sizing pass, so that we can make sure we
12643 * don't split too-long nodes in the middle of a potential
12644 * multi-char fold. And unlike the regular fold case
12645 * handled in the else clauses below, we don't actually
12646 * fold and don't have special cases to consider. What we
12647 * do for both passes is the PASS2 code for non-folding */
12648 goto not_fold_common;
12650 else /* A regular FOLD code point */
12652 /* See comments for join_exact() as to why we fold this
12653 * non-UTF at compile time */
12654 || (node_type == EXACTFU
12655 && ender == LATIN_SMALL_LETTER_SHARP_S)))
12657 /* Here, are folding and are not UTF-8 encoded; therefore
12658 * the character must be in the range 0-255, and is not /l
12659 * (Not /l because we already handled these under /l in
12660 * is_PROBLEMATIC_LOCALE_FOLD_cp) */
12661 if (IS_IN_SOME_FOLD_L1(ender)) {
12662 maybe_exact = FALSE;
12664 /* See if the character's fold differs between /d and
12665 * /u. This includes the multi-char fold SHARP S to
12668 && (PL_fold[ender] != PL_fold_latin1[ender]
12669 || ender == LATIN_SMALL_LETTER_SHARP_S
12671 && isALPHA_FOLD_EQ(ender, 's')
12672 && isALPHA_FOLD_EQ(*(s-1), 's'))))
12674 maybe_exactfu = FALSE;
12678 /* Even when folding, we store just the input character, as
12679 * we have an array that finds its fold quickly */
12680 *(s++) = (char) ender;
12682 else { /* FOLD and UTF */
12683 /* Unlike the non-fold case, we do actually have to
12684 * calculate the results here in pass 1. This is for two
12685 * reasons, the folded length may be longer than the
12686 * unfolded, and we have to calculate how many EXACTish
12687 * nodes it will take; and we may run out of room in a node
12688 * in the middle of a potential multi-char fold, and have
12689 * to back off accordingly. */
12692 if (isASCII_uni(ender)) {
12693 folded = toFOLD(ender);
12694 *(s)++ = (U8) folded;
12699 folded = _to_uni_fold_flags(
12703 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12704 ? FOLD_FLAGS_NOMIX_ASCII
12708 /* The loop increments <len> each time, as all but this
12709 * path (and one other) through it add a single byte to
12710 * the EXACTish node. But this one has changed len to
12711 * be the correct final value, so subtract one to
12712 * cancel out the increment that follows */
12713 len += foldlen - 1;
12715 /* If this node only contains non-folding code points so
12716 * far, see if this new one is also non-folding */
12718 if (folded != ender) {
12719 maybe_exact = FALSE;
12722 /* Here the fold is the original; we have to check
12723 * further to see if anything folds to it */
12724 if (_invlist_contains_cp(PL_utf8_foldable,
12727 maybe_exact = FALSE;
12734 if (next_is_quantifier) {
12736 /* Here, the next input is a quantifier, and to get here,
12737 * the current character is the only one in the node.
12738 * Also, here <len> doesn't include the final byte for this
12744 } /* End of loop through literal characters */
12746 /* Here we have either exhausted the input or ran out of room in
12747 * the node. (If we encountered a character that can't be in the
12748 * node, transfer is made directly to <loopdone>, and so we
12749 * wouldn't have fallen off the end of the loop.) In the latter
12750 * case, we artificially have to split the node into two, because
12751 * we just don't have enough space to hold everything. This
12752 * creates a problem if the final character participates in a
12753 * multi-character fold in the non-final position, as a match that
12754 * should have occurred won't, due to the way nodes are matched,
12755 * and our artificial boundary. So back off until we find a non-
12756 * problematic character -- one that isn't at the beginning or
12757 * middle of such a fold. (Either it doesn't participate in any
12758 * folds, or appears only in the final position of all the folds it
12759 * does participate in.) A better solution with far fewer false
12760 * positives, and that would fill the nodes more completely, would
12761 * be to actually have available all the multi-character folds to
12762 * test against, and to back-off only far enough to be sure that
12763 * this node isn't ending with a partial one. <upper_parse> is set
12764 * further below (if we need to reparse the node) to include just
12765 * up through that final non-problematic character that this code
12766 * identifies, so when it is set to less than the full node, we can
12767 * skip the rest of this */
12768 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12770 const STRLEN full_len = len;
12772 assert(len >= MAX_NODE_STRING_SIZE);
12774 /* Here, <s> points to the final byte of the final character.
12775 * Look backwards through the string until find a non-
12776 * problematic character */
12780 /* This has no multi-char folds to non-UTF characters */
12781 if (ASCII_FOLD_RESTRICTED) {
12785 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12789 if (! PL_NonL1NonFinalFold) {
12790 PL_NonL1NonFinalFold = _new_invlist_C_array(
12791 NonL1_Perl_Non_Final_Folds_invlist);
12794 /* Point to the first byte of the final character */
12795 s = (char *) utf8_hop((U8 *) s, -1);
12797 while (s >= s0) { /* Search backwards until find
12798 non-problematic char */
12799 if (UTF8_IS_INVARIANT(*s)) {
12801 /* There are no ascii characters that participate
12802 * in multi-char folds under /aa. In EBCDIC, the
12803 * non-ascii invariants are all control characters,
12804 * so don't ever participate in any folds. */
12805 if (ASCII_FOLD_RESTRICTED
12806 || ! IS_NON_FINAL_FOLD(*s))
12811 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12812 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12818 else if (! _invlist_contains_cp(
12819 PL_NonL1NonFinalFold,
12820 valid_utf8_to_uvchr((U8 *) s, NULL)))
12825 /* Here, the current character is problematic in that
12826 * it does occur in the non-final position of some
12827 * fold, so try the character before it, but have to
12828 * special case the very first byte in the string, so
12829 * we don't read outside the string */
12830 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12831 } /* End of loop backwards through the string */
12833 /* If there were only problematic characters in the string,
12834 * <s> will point to before s0, in which case the length
12835 * should be 0, otherwise include the length of the
12836 * non-problematic character just found */
12837 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12840 /* Here, have found the final character, if any, that is
12841 * non-problematic as far as ending the node without splitting
12842 * it across a potential multi-char fold. <len> contains the
12843 * number of bytes in the node up-to and including that
12844 * character, or is 0 if there is no such character, meaning
12845 * the whole node contains only problematic characters. In
12846 * this case, give up and just take the node as-is. We can't
12851 /* If the node ends in an 's' we make sure it stays EXACTF,
12852 * as if it turns into an EXACTFU, it could later get
12853 * joined with another 's' that would then wrongly match
12855 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12857 maybe_exactfu = FALSE;
12861 /* Here, the node does contain some characters that aren't
12862 * problematic. If one such is the final character in the
12863 * node, we are done */
12864 if (len == full_len) {
12867 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12869 /* If the final character is problematic, but the
12870 * penultimate is not, back-off that last character to
12871 * later start a new node with it */
12876 /* Here, the final non-problematic character is earlier
12877 * in the input than the penultimate character. What we do
12878 * is reparse from the beginning, going up only as far as
12879 * this final ok one, thus guaranteeing that the node ends
12880 * in an acceptable character. The reason we reparse is
12881 * that we know how far in the character is, but we don't
12882 * know how to correlate its position with the input parse.
12883 * An alternate implementation would be to build that
12884 * correlation as we go along during the original parse,
12885 * but that would entail extra work for every node, whereas
12886 * this code gets executed only when the string is too
12887 * large for the node, and the final two characters are
12888 * problematic, an infrequent occurrence. Yet another
12889 * possible strategy would be to save the tail of the
12890 * string, and the next time regatom is called, initialize
12891 * with that. The problem with this is that unless you
12892 * back off one more character, you won't be guaranteed
12893 * regatom will get called again, unless regbranch,
12894 * regpiece ... are also changed. If you do back off that
12895 * extra character, so that there is input guaranteed to
12896 * force calling regatom, you can't handle the case where
12897 * just the first character in the node is acceptable. I
12898 * (khw) decided to try this method which doesn't have that
12899 * pitfall; if performance issues are found, we can do a
12900 * combination of the current approach plus that one */
12906 } /* End of verifying node ends with an appropriate char */
12908 loopdone: /* Jumped to when encounters something that shouldn't be
12911 /* I (khw) don't know if you can get here with zero length, but the
12912 * old code handled this situation by creating a zero-length EXACT
12913 * node. Might as well be NOTHING instead */
12919 /* If 'maybe_exact' is still set here, means there are no
12920 * code points in the node that participate in folds;
12921 * similarly for 'maybe_exactfu' and code points that match
12922 * differently depending on UTF8ness of the target string
12923 * (for /u), or depending on locale for /l */
12929 else if (maybe_exactfu) {
12935 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12936 FALSE /* Don't look to see if could
12937 be turned into an EXACT
12938 node, as we have already
12943 RExC_parse = p - 1;
12944 Set_Node_Cur_Length(ret, parse_start);
12945 nextchar(pRExC_state);
12947 /* len is STRLEN which is unsigned, need to copy to signed */
12950 vFAIL("Internal disaster");
12953 } /* End of label 'defchar:' */
12955 } /* End of giant switch on input character */
12961 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12963 /* Returns the next non-pattern-white space, non-comment character (the
12964 * latter only if 'recognize_comment is true) in the string p, which is
12965 * ended by RExC_end. See also reg_skipcomment */
12966 const char *e = RExC_end;
12968 PERL_ARGS_ASSERT_REGPATWS;
12972 if ((len = is_PATWS_safe(p, e, UTF))) {
12975 else if (recognize_comment && *p == '#') {
12976 p = reg_skipcomment(pRExC_state, p);
12985 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12987 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12988 * sets up the bitmap and any flags, removing those code points from the
12989 * inversion list, setting it to NULL should it become completely empty */
12991 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12992 assert(PL_regkind[OP(node)] == ANYOF);
12994 ANYOF_BITMAP_ZERO(node);
12995 if (*invlist_ptr) {
12997 /* This gets set if we actually need to modify things */
12998 bool change_invlist = FALSE;
13002 /* Start looking through *invlist_ptr */
13003 invlist_iterinit(*invlist_ptr);
13004 while (invlist_iternext(*invlist_ptr, &start, &end)) {
13008 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
13009 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
13011 else if (end >= NUM_ANYOF_CODE_POINTS) {
13012 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
13015 /* Quit if are above what we should change */
13016 if (start >= NUM_ANYOF_CODE_POINTS) {
13020 change_invlist = TRUE;
13022 /* Set all the bits in the range, up to the max that we are doing */
13023 high = (end < NUM_ANYOF_CODE_POINTS - 1)
13025 : NUM_ANYOF_CODE_POINTS - 1;
13026 for (i = start; i <= (int) high; i++) {
13027 if (! ANYOF_BITMAP_TEST(node, i)) {
13028 ANYOF_BITMAP_SET(node, i);
13032 invlist_iterfinish(*invlist_ptr);
13034 /* Done with loop; remove any code points that are in the bitmap from
13035 * *invlist_ptr; similarly for code points above the bitmap if we have
13036 * a flag to match all of them anyways */
13037 if (change_invlist) {
13038 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
13040 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
13041 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
13044 /* If have completely emptied it, remove it completely */
13045 if (_invlist_len(*invlist_ptr) == 0) {
13046 SvREFCNT_dec_NN(*invlist_ptr);
13047 *invlist_ptr = NULL;
13052 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
13053 Character classes ([:foo:]) can also be negated ([:^foo:]).
13054 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
13055 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
13056 but trigger failures because they are currently unimplemented. */
13058 #define POSIXCC_DONE(c) ((c) == ':')
13059 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
13060 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
13062 PERL_STATIC_INLINE I32
13063 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
13065 I32 namedclass = OOB_NAMEDCLASS;
13067 PERL_ARGS_ASSERT_REGPPOSIXCC;
13069 if (value == '[' && RExC_parse + 1 < RExC_end &&
13070 /* I smell either [: or [= or [. -- POSIX has been here, right? */
13071 POSIXCC(UCHARAT(RExC_parse)))
13073 const char c = UCHARAT(RExC_parse);
13074 char* const s = RExC_parse++;
13076 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
13078 if (RExC_parse == RExC_end) {
13081 /* Try to give a better location for the error (than the end of
13082 * the string) by looking for the matching ']' */
13084 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
13087 vFAIL2("Unmatched '%c' in POSIX class", c);
13089 /* Grandfather lone [:, [=, [. */
13093 const char* const t = RExC_parse++; /* skip over the c */
13096 if (UCHARAT(RExC_parse) == ']') {
13097 const char *posixcc = s + 1;
13098 RExC_parse++; /* skip over the ending ] */
13101 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
13102 const I32 skip = t - posixcc;
13104 /* Initially switch on the length of the name. */
13107 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
13108 this is the Perl \w
13110 namedclass = ANYOF_WORDCHAR;
13113 /* Names all of length 5. */
13114 /* alnum alpha ascii blank cntrl digit graph lower
13115 print punct space upper */
13116 /* Offset 4 gives the best switch position. */
13117 switch (posixcc[4]) {
13119 if (memEQ(posixcc, "alph", 4)) /* alpha */
13120 namedclass = ANYOF_ALPHA;
13123 if (memEQ(posixcc, "spac", 4)) /* space */
13124 namedclass = ANYOF_SPACE;
13127 if (memEQ(posixcc, "grap", 4)) /* graph */
13128 namedclass = ANYOF_GRAPH;
13131 if (memEQ(posixcc, "asci", 4)) /* ascii */
13132 namedclass = ANYOF_ASCII;
13135 if (memEQ(posixcc, "blan", 4)) /* blank */
13136 namedclass = ANYOF_BLANK;
13139 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
13140 namedclass = ANYOF_CNTRL;
13143 if (memEQ(posixcc, "alnu", 4)) /* alnum */
13144 namedclass = ANYOF_ALPHANUMERIC;
13147 if (memEQ(posixcc, "lowe", 4)) /* lower */
13148 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
13149 else if (memEQ(posixcc, "uppe", 4)) /* upper */
13150 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
13153 if (memEQ(posixcc, "digi", 4)) /* digit */
13154 namedclass = ANYOF_DIGIT;
13155 else if (memEQ(posixcc, "prin", 4)) /* print */
13156 namedclass = ANYOF_PRINT;
13157 else if (memEQ(posixcc, "punc", 4)) /* punct */
13158 namedclass = ANYOF_PUNCT;
13163 if (memEQ(posixcc, "xdigit", 6))
13164 namedclass = ANYOF_XDIGIT;
13168 if (namedclass == OOB_NAMEDCLASS)
13170 "POSIX class [:%"UTF8f":] unknown",
13171 UTF8fARG(UTF, t - s - 1, s + 1));
13173 /* The #defines are structured so each complement is +1 to
13174 * the normal one */
13178 assert (posixcc[skip] == ':');
13179 assert (posixcc[skip+1] == ']');
13180 } else if (!SIZE_ONLY) {
13181 /* [[=foo=]] and [[.foo.]] are still future. */
13183 /* adjust RExC_parse so the warning shows after
13184 the class closes */
13185 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
13187 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
13190 /* Maternal grandfather:
13191 * "[:" ending in ":" but not in ":]" */
13193 vFAIL("Unmatched '[' in POSIX class");
13196 /* Grandfather lone [:, [=, [. */
13206 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
13208 /* This applies some heuristics at the current parse position (which should
13209 * be at a '[') to see if what follows might be intended to be a [:posix:]
13210 * class. It returns true if it really is a posix class, of course, but it
13211 * also can return true if it thinks that what was intended was a posix
13212 * class that didn't quite make it.
13214 * It will return true for
13216 * [:alphanumerics] (as long as the ] isn't followed immediately by a
13217 * ')' indicating the end of the (?[
13218 * [:any garbage including %^&$ punctuation:]
13220 * This is designed to be called only from S_handle_regex_sets; it could be
13221 * easily adapted to be called from the spot at the beginning of regclass()
13222 * that checks to see in a normal bracketed class if the surrounding []
13223 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
13224 * change long-standing behavior, so I (khw) didn't do that */
13225 char* p = RExC_parse + 1;
13226 char first_char = *p;
13228 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
13230 assert(*(p - 1) == '[');
13232 if (! POSIXCC(first_char)) {
13237 while (p < RExC_end && isWORDCHAR(*p)) p++;
13239 if (p >= RExC_end) {
13243 if (p - RExC_parse > 2 /* Got at least 1 word character */
13244 && (*p == first_char
13245 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
13250 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
13253 && p - RExC_parse > 2 /* [:] evaluates to colon;
13254 [::] is a bad posix class. */
13255 && first_char == *(p - 1));
13258 STATIC unsigned int
13259 S_regex_set_precedence(const U8 my_operator) {
13261 /* Returns the precedence in the (?[...]) construct of the input operator,
13262 * specified by its character representation. The precedence follows
13263 * general Perl rules, but it extends this so that ')' and ']' have (low)
13264 * precedence even though they aren't really operators */
13266 switch (my_operator) {
13282 NOT_REACHED; /* NOTREACHED */
13283 return 0; /* Silence compiler warning */
13287 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
13288 I32 *flagp, U32 depth,
13289 char * const oregcomp_parse)
13291 /* Handle the (?[...]) construct to do set operations */
13293 U8 curchar; /* Current character being parsed */
13294 UV start, end; /* End points of code point ranges */
13295 SV* final = NULL; /* The end result inversion list */
13296 SV* result_string; /* 'final' stringified */
13297 AV* stack; /* stack of operators and operands not yet
13299 AV* fence_stack = NULL; /* A stack containing the positions in
13300 'stack' of where the undealt-with left
13301 parens would be if they were actually
13303 IV fence = 0; /* Position of where most recent undealt-
13304 with left paren in stack is; -1 if none.
13306 STRLEN len; /* Temporary */
13307 regnode* node; /* Temporary, and final regnode returned by
13309 const bool save_fold = FOLD; /* Temporary */
13310 char *save_end, *save_parse; /* Temporaries */
13312 GET_RE_DEBUG_FLAGS_DECL;
13314 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
13316 if (LOC) { /* XXX could make valid in UTF-8 locales */
13317 vFAIL("(?[...]) not valid in locale");
13319 RExC_uni_semantics = 1; /* The use of this operator implies /u. This
13320 is required so that the compile time values
13321 are valid in all runtime cases */
13323 /* This will return only an ANYOF regnode, or (unlikely) something smaller
13324 * (such as EXACT). Thus we can skip most everything if just sizing. We
13325 * call regclass to handle '[]' so as to not have to reinvent its parsing
13326 * rules here (throwing away the size it computes each time). And, we exit
13327 * upon an unescaped ']' that isn't one ending a regclass. To do both
13328 * these things, we need to realize that something preceded by a backslash
13329 * is escaped, so we have to keep track of backslashes */
13331 UV depth = 0; /* how many nested (?[...]) constructs */
13333 while (RExC_parse < RExC_end) {
13334 SV* current = NULL;
13335 RExC_parse = regpatws(pRExC_state, RExC_parse,
13336 TRUE); /* means recognize comments */
13337 switch (*RExC_parse) {
13339 if (RExC_parse[1] == '[') depth++, RExC_parse++;
13344 /* Skip the next byte (which could cause us to end up in
13345 * the middle of a UTF-8 character, but since none of those
13346 * are confusable with anything we currently handle in this
13347 * switch (invariants all), it's safe. We'll just hit the
13348 * default: case next time and keep on incrementing until
13349 * we find one of the invariants we do handle. */
13354 /* If this looks like it is a [:posix:] class, leave the
13355 * parse pointer at the '[' to fool regclass() into
13356 * thinking it is part of a '[[:posix:]]'. That function
13357 * will use strict checking to force a syntax error if it
13358 * doesn't work out to a legitimate class */
13359 bool is_posix_class
13360 = could_it_be_a_POSIX_class(pRExC_state);
13361 if (! is_posix_class) {
13365 /* regclass() can only return RESTART_UTF8 if multi-char
13366 folds are allowed. */
13367 if (!regclass(pRExC_state, flagp,depth+1,
13368 is_posix_class, /* parse the whole char
13369 class only if not a
13371 FALSE, /* don't allow multi-char folds */
13372 TRUE, /* silence non-portable warnings. */
13376 FAIL2("panic: regclass returned NULL to handle_sets, "
13377 "flags=%#"UVxf"", (UV) *flagp);
13379 /* function call leaves parse pointing to the ']', except
13380 * if we faked it */
13381 if (is_posix_class) {
13385 SvREFCNT_dec(current); /* In case it returned something */
13390 if (depth--) break;
13392 if (RExC_parse < RExC_end
13393 && *RExC_parse == ')')
13395 node = reganode(pRExC_state, ANYOF, 0);
13396 RExC_size += ANYOF_SKIP;
13397 nextchar(pRExC_state);
13398 Set_Node_Length(node,
13399 RExC_parse - oregcomp_parse + 1); /* MJD */
13408 FAIL("Syntax error in (?[...])");
13411 /* Pass 2 only after this. */
13412 Perl_ck_warner_d(aTHX_
13413 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
13414 "The regex_sets feature is experimental" REPORT_LOCATION,
13415 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
13417 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
13418 RExC_precomp + (RExC_parse - RExC_precomp)));
13420 /* Everything in this construct is a metacharacter. Operands begin with
13421 * either a '\' (for an escape sequence), or a '[' for a bracketed
13422 * character class. Any other character should be an operator, or
13423 * parenthesis for grouping. Both types of operands are handled by calling
13424 * regclass() to parse them. It is called with a parameter to indicate to
13425 * return the computed inversion list. The parsing here is implemented via
13426 * a stack. Each entry on the stack is a single character representing one
13427 * of the operators; or else a pointer to an operand inversion list. */
13429 #define IS_OPERAND(a) (! SvIOK(a))
13431 /* The stack is kept in Łukasiewicz order. (That's pronounced similar
13432 * to luke-a-shave-itch (or -itz), but people who didn't want to bother
13433 * with prounouncing it called it Reverse Polish instead, but now that YOU
13434 * know how to prounounce it you can use the correct term, thus giving due
13435 * credit to the person who invented it, and impressing your geek friends.
13436 * Wikipedia says that the pronounciation of "Ł" has been changing so that
13437 * it is now more like an English initial W (as in wonk) than an L.)
13439 * This means that, for example, 'a | b & c' is stored on the stack as
13447 * where the numbers in brackets give the stack [array] element number.
13448 * In this implementation, parentheses are not stored on the stack.
13449 * Instead a '(' creates a "fence" so that the part of the stack below the
13450 * fence is invisible except to the corresponding ')' (this allows us to
13451 * replace testing for parens, by using instead subtraction of the fence
13452 * position). As new operands are processed they are pushed onto the stack
13453 * (except as noted in the next paragraph). New operators of higher
13454 * precedence than the current final one are inserted on the stack before
13455 * the lhs operand (so that when the rhs is pushed next, everything will be
13456 * in the correct positions shown above. When an operator of equal or
13457 * lower precedence is encountered in parsing, all the stacked operations
13458 * of equal or higher precedence are evaluated, leaving the result as the
13459 * top entry on the stack. This makes higher precedence operations
13460 * evaluate before lower precedence ones, and causes operations of equal
13461 * precedence to left associate.
13463 * The only unary operator '!' is immediately pushed onto the stack when
13464 * encountered. When an operand is encountered, if the top of the stack is
13465 * a '!", the complement is immediately performed, and the '!' popped. The
13466 * resulting value is treated as a new operand, and the logic in the
13467 * previous paragraph is executed. Thus in the expression
13469 * the stack looks like
13475 * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
13482 * A ')' is treated as an operator with lower precedence than all the
13483 * aforementioned ones, which causes all operations on the stack above the
13484 * corresponding '(' to be evaluated down to a single resultant operand.
13485 * Then the fence for the '(' is removed, and the operand goes through the
13486 * algorithm above, without the fence.
13488 * A separate stack is kept of the fence positions, so that the position of
13489 * the latest so-far unbalanced '(' is at the top of it.
13491 * The ']' ending the construct is treated as the lowest operator of all,
13492 * so that everything gets evaluated down to a single operand, which is the
13495 sv_2mortal((SV *)(stack = newAV()));
13496 sv_2mortal((SV *)(fence_stack = newAV()));
13498 while (RExC_parse < RExC_end) {
13499 I32 top_index; /* Index of top-most element in 'stack' */
13500 SV** top_ptr; /* Pointer to top 'stack' element */
13501 SV* current = NULL; /* To contain the current inversion list
13503 SV* only_to_avoid_leaks;
13505 /* Skip white space */
13506 RExC_parse = regpatws(pRExC_state, RExC_parse,
13507 TRUE /* means recognize comments */ );
13508 if (RExC_parse >= RExC_end) {
13509 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
13512 curchar = UCHARAT(RExC_parse);
13516 top_index = av_tindex(stack);
13519 SV** stacked_ptr; /* Ptr to something already on 'stack' */
13520 char stacked_operator; /* The topmost operator on the 'stack'. */
13521 SV* lhs; /* Operand to the left of the operator */
13522 SV* rhs; /* Operand to the right of the operator */
13523 SV* fence_ptr; /* Pointer to top element of the fence
13528 if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
13530 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
13531 * This happens when we have some thing like
13533 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
13535 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
13537 * Here we would be handling the interpolated
13538 * '$thai_or_lao'. We handle this by a recursive call to
13539 * ourselves which returns the inversion list the
13540 * interpolated expression evaluates to. We use the flags
13541 * from the interpolated pattern. */
13542 U32 save_flags = RExC_flags;
13543 const char * save_parse;
13545 RExC_parse += 2; /* Skip past the '(?' */
13546 save_parse = RExC_parse;
13548 /* Parse any flags for the '(?' */
13549 parse_lparen_question_flags(pRExC_state);
13551 if (RExC_parse == save_parse /* Makes sure there was at
13552 least one flag (or else
13553 this embedding wasn't
13555 || RExC_parse >= RExC_end - 4
13556 || UCHARAT(RExC_parse) != ':'
13557 || UCHARAT(++RExC_parse) != '('
13558 || UCHARAT(++RExC_parse) != '?'
13559 || UCHARAT(++RExC_parse) != '[')
13562 /* In combination with the above, this moves the
13563 * pointer to the point just after the first erroneous
13564 * character (or if there are no flags, to where they
13565 * should have been) */
13566 if (RExC_parse >= RExC_end - 4) {
13567 RExC_parse = RExC_end;
13569 else if (RExC_parse != save_parse) {
13570 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13572 vFAIL("Expecting '(?flags:(?[...'");
13575 /* Recurse, with the meat of the embedded expression */
13577 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
13578 depth+1, oregcomp_parse);
13580 /* Here, 'current' contains the embedded expression's
13581 * inversion list, and RExC_parse points to the trailing
13582 * ']'; the next character should be the ')' */
13584 assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13586 /* Then the ')' matching the original '(' handled by this
13587 * case: statement */
13589 assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
13592 RExC_flags = save_flags;
13593 goto handle_operand;
13596 /* A regular '('. Look behind for illegal syntax */
13597 if (top_index - fence >= 0) {
13598 /* If the top entry on the stack is an operator, it had
13599 * better be a '!', otherwise the entry below the top
13600 * operand should be an operator */
13601 if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
13602 || (! IS_OPERAND(*top_ptr) && SvUV(*top_ptr) != '!')
13603 || top_index - fence < 1
13604 || ! (stacked_ptr = av_fetch(stack,
13607 || IS_OPERAND(*stacked_ptr))
13610 vFAIL("Unexpected '(' with no preceding operator");
13614 /* Stack the position of this undealt-with left paren */
13615 fence = top_index + 1;
13616 av_push(fence_stack, newSViv(fence));
13620 /* regclass() can only return RESTART_UTF8 if multi-char
13621 folds are allowed. */
13622 if (!regclass(pRExC_state, flagp,depth+1,
13623 TRUE, /* means parse just the next thing */
13624 FALSE, /* don't allow multi-char folds */
13625 FALSE, /* don't silence non-portable warnings. */
13629 FAIL2("panic: regclass returned NULL to handle_sets, "
13630 "flags=%#"UVxf"", (UV) *flagp);
13633 /* regclass() will return with parsing just the \ sequence,
13634 * leaving the parse pointer at the next thing to parse */
13636 goto handle_operand;
13638 case '[': /* Is a bracketed character class */
13640 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13642 if (! is_posix_class) {
13646 /* regclass() can only return RESTART_UTF8 if multi-char
13647 folds are allowed. */
13648 if(!regclass(pRExC_state, flagp,depth+1,
13649 is_posix_class, /* parse the whole char class
13650 only if not a posix class */
13651 FALSE, /* don't allow multi-char folds */
13652 FALSE, /* don't silence non-portable warnings. */
13657 FAIL2("panic: regclass returned NULL to handle_sets, "
13658 "flags=%#"UVxf"", (UV) *flagp);
13661 /* function call leaves parse pointing to the ']', except if we
13663 if (is_posix_class) {
13667 goto handle_operand;
13671 if (top_index >= 1) {
13672 goto join_operators;
13675 /* Only a single operand on the stack: are done */
13679 if (av_tindex(fence_stack) < 0) {
13681 vFAIL("Unexpected ')'");
13684 /* If at least two thing on the stack, treat this as an
13686 if (top_index - fence >= 1) {
13687 goto join_operators;
13690 /* Here only a single thing on the fenced stack, and there is a
13691 * fence. Get rid of it */
13692 fence_ptr = av_pop(fence_stack);
13694 fence = SvIV(fence_ptr) - 1;
13695 SvREFCNT_dec_NN(fence_ptr);
13702 /* Having gotten rid of the fence, we pop the operand at the
13703 * stack top and process it as a newly encountered operand */
13704 current = av_pop(stack);
13705 assert(IS_OPERAND(current));
13706 goto handle_operand;
13714 /* These binary operators should have a left operand already
13716 if ( top_index - fence < 0
13717 || top_index - fence == 1
13718 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13719 || ! IS_OPERAND(*top_ptr))
13721 goto unexpected_binary;
13724 /* If only the one operand is on the part of the stack visible
13725 * to us, we just place this operator in the proper position */
13726 if (top_index - fence < 2) {
13728 /* Place the operator before the operand */
13730 SV* lhs = av_pop(stack);
13731 av_push(stack, newSVuv(curchar));
13732 av_push(stack, lhs);
13736 /* But if there is something else on the stack, we need to
13737 * process it before this new operator if and only if the
13738 * stacked operation has equal or higher precedence than the
13743 /* The operator on the stack is supposed to be below both its
13745 if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
13746 || IS_OPERAND(*stacked_ptr))
13748 /* But if not, it's legal and indicates we are completely
13749 * done if and only if we're currently processing a ']',
13750 * which should be the final thing in the expression */
13751 if (curchar == ']') {
13757 vFAIL2("Unexpected binary operator '%c' with no "
13758 "preceding operand", curchar);
13760 stacked_operator = (char) SvUV(*stacked_ptr);
13762 if (regex_set_precedence(curchar)
13763 > regex_set_precedence(stacked_operator))
13765 /* Here, the new operator has higher precedence than the
13766 * stacked one. This means we need to add the new one to
13767 * the stack to await its rhs operand (and maybe more
13768 * stuff). We put it before the lhs operand, leaving
13769 * untouched the stacked operator and everything below it
13771 lhs = av_pop(stack);
13772 assert(IS_OPERAND(lhs));
13774 av_push(stack, newSVuv(curchar));
13775 av_push(stack, lhs);
13779 /* Here, the new operator has equal or lower precedence than
13780 * what's already there. This means the operation already
13781 * there should be performed now, before the new one. */
13782 rhs = av_pop(stack);
13783 lhs = av_pop(stack);
13785 assert(IS_OPERAND(rhs));
13786 assert(IS_OPERAND(lhs));
13788 switch (stacked_operator) {
13790 _invlist_intersection(lhs, rhs, &rhs);
13795 _invlist_union(lhs, rhs, &rhs);
13799 _invlist_subtract(lhs, rhs, &rhs);
13802 case '^': /* The union minus the intersection */
13808 _invlist_union(lhs, rhs, &u);
13809 _invlist_intersection(lhs, rhs, &i);
13810 /* _invlist_subtract will overwrite rhs
13811 without freeing what it already contains */
13813 _invlist_subtract(u, i, &rhs);
13814 SvREFCNT_dec_NN(i);
13815 SvREFCNT_dec_NN(u);
13816 SvREFCNT_dec_NN(element);
13822 /* Here, the higher precedence operation has been done, and the
13823 * result is in 'rhs'. We overwrite the stacked operator with
13824 * the result. Then we redo this code to either push the new
13825 * operator onto the stack or perform any higher precedence
13826 * stacked operation */
13827 only_to_avoid_leaks = av_pop(stack);
13828 SvREFCNT_dec(only_to_avoid_leaks);
13829 av_push(stack, rhs);
13832 case '!': /* Highest priority, right associative, so just push
13834 av_push(stack, newSVuv(curchar));
13838 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13839 vFAIL("Unexpected character");
13843 /* Here 'current' is the operand. If something is already on the
13844 * stack, we have to check if it is a !. */
13845 top_index = av_tindex(stack); /* Code above may have altered the
13846 * stack in the time since we
13847 * earlier set 'top_index'. */
13848 if (top_index - fence >= 0) {
13849 /* If the top entry on the stack is an operator, it had better
13850 * be a '!', otherwise the entry below the top operand should
13851 * be an operator */
13852 top_ptr = av_fetch(stack, top_index, FALSE);
13854 if (! IS_OPERAND(*top_ptr)) {
13856 /* The only permissible operator at the top of the stack is
13857 * '!', which is applied immediately to this operand. */
13858 curchar = (char) SvUV(*top_ptr);
13859 if (curchar != '!') {
13860 SvREFCNT_dec(current);
13861 vFAIL2("Unexpected binary operator '%c' with no "
13862 "preceding operand", curchar);
13865 _invlist_invert(current);
13867 only_to_avoid_leaks = av_pop(stack);
13868 SvREFCNT_dec(only_to_avoid_leaks);
13869 top_index = av_tindex(stack);
13871 /* And we redo with the inverted operand. This allows
13872 * handling multiple ! in a row */
13873 goto handle_operand;
13875 /* Single operand is ok only for the non-binary ')'
13877 else if ((top_index - fence == 0 && curchar != ')')
13878 || (top_index - fence > 0
13879 && (! (stacked_ptr = av_fetch(stack,
13882 || IS_OPERAND(*stacked_ptr))))
13884 SvREFCNT_dec(current);
13885 vFAIL("Operand with no preceding operator");
13889 /* Here there was nothing on the stack or the top element was
13890 * another operand. Just add this new one */
13891 av_push(stack, current);
13893 } /* End of switch on next parse token */
13895 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13896 } /* End of loop parsing through the construct */
13899 if (av_tindex(fence_stack) >= 0) {
13900 vFAIL("Unmatched (");
13903 if (av_tindex(stack) < 0 /* Was empty */
13904 || ((final = av_pop(stack)) == NULL)
13905 || ! IS_OPERAND(final)
13906 || av_tindex(stack) >= 0) /* More left on stack */
13908 SvREFCNT_dec(final);
13909 vFAIL("Incomplete expression within '(?[ ])'");
13912 /* Here, 'final' is the resultant inversion list from evaluating the
13913 * expression. Return it if so requested */
13914 if (return_invlist) {
13915 *return_invlist = final;
13919 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13920 * expecting a string of ranges and individual code points */
13921 invlist_iterinit(final);
13922 result_string = newSVpvs("");
13923 while (invlist_iternext(final, &start, &end)) {
13924 if (start == end) {
13925 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13928 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13933 /* About to generate an ANYOF (or similar) node from the inversion list we
13934 * have calculated */
13935 save_parse = RExC_parse;
13936 RExC_parse = SvPV(result_string, len);
13937 save_end = RExC_end;
13938 RExC_end = RExC_parse + len;
13940 /* We turn off folding around the call, as the class we have constructed
13941 * already has all folding taken into consideration, and we don't want
13942 * regclass() to add to that */
13943 RExC_flags &= ~RXf_PMf_FOLD;
13944 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13946 node = regclass(pRExC_state, flagp,depth+1,
13947 FALSE, /* means parse the whole char class */
13948 FALSE, /* don't allow multi-char folds */
13949 TRUE, /* silence non-portable warnings. The above may very
13950 well have generated non-portable code points, but
13951 they're valid on this machine */
13952 FALSE, /* similarly, no need for strict */
13956 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13959 RExC_flags |= RXf_PMf_FOLD;
13961 RExC_parse = save_parse + 1;
13962 RExC_end = save_end;
13963 SvREFCNT_dec_NN(final);
13964 SvREFCNT_dec_NN(result_string);
13966 nextchar(pRExC_state);
13967 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13973 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13975 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13976 * innocent-looking character class, like /[ks]/i won't have to go out to
13977 * disk to find the possible matches.
13979 * This should be called only for a Latin1-range code points, cp, which is
13980 * known to be involved in a simple fold with other code points above
13981 * Latin1. It would give false results if /aa has been specified.
13982 * Multi-char folds are outside the scope of this, and must be handled
13985 * XXX It would be better to generate these via regen, in case a new
13986 * version of the Unicode standard adds new mappings, though that is not
13987 * really likely, and may be caught by the default: case of the switch
13990 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13992 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13998 add_cp_to_invlist(*invlist, KELVIN_SIGN);
14002 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
14005 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
14006 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
14008 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
14009 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
14010 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
14012 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
14013 *invlist = add_cp_to_invlist(*invlist,
14014 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
14016 case LATIN_SMALL_LETTER_SHARP_S:
14017 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
14020 /* Use deprecated warning to increase the chances of this being
14023 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
14030 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
14032 /* This adds the string scalar <multi_string> to the array
14033 * <multi_char_matches>. <multi_string> is known to have exactly
14034 * <cp_count> code points in it. This is used when constructing a
14035 * bracketed character class and we find something that needs to match more
14036 * than a single character.
14038 * <multi_char_matches> is actually an array of arrays. Each top-level
14039 * element is an array that contains all the strings known so far that are
14040 * the same length. And that length (in number of code points) is the same
14041 * as the index of the top-level array. Hence, the [2] element is an
14042 * array, each element thereof is a string containing TWO code points;
14043 * while element [3] is for strings of THREE characters, and so on. Since
14044 * this is for multi-char strings there can never be a [0] nor [1] element.
14046 * When we rewrite the character class below, we will do so such that the
14047 * longest strings are written first, so that it prefers the longest
14048 * matching strings first. This is done even if it turns out that any
14049 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
14050 * Christiansen has agreed that this is ok. This makes the test for the
14051 * ligature 'ffi' come before the test for 'ff', for example */
14054 AV** this_array_ptr;
14056 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
14058 if (! multi_char_matches) {
14059 multi_char_matches = newAV();
14062 if (av_exists(multi_char_matches, cp_count)) {
14063 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
14064 this_array = *this_array_ptr;
14067 this_array = newAV();
14068 av_store(multi_char_matches, cp_count,
14071 av_push(this_array, multi_string);
14073 return multi_char_matches;
14076 /* The names of properties whose definitions are not known at compile time are
14077 * stored in this SV, after a constant heading. So if the length has been
14078 * changed since initialization, then there is a run-time definition. */
14079 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
14080 (SvCUR(listsv) != initial_listsv_len)
14083 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
14084 const bool stop_at_1, /* Just parse the next thing, don't
14085 look for a full character class */
14086 bool allow_multi_folds,
14087 const bool silence_non_portable, /* Don't output warnings
14091 SV** ret_invlist /* Return an inversion list, not a node */
14094 /* parse a bracketed class specification. Most of these will produce an
14095 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
14096 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
14097 * under /i with multi-character folds: it will be rewritten following the
14098 * paradigm of this example, where the <multi-fold>s are characters which
14099 * fold to multiple character sequences:
14100 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
14101 * gets effectively rewritten as:
14102 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
14103 * reg() gets called (recursively) on the rewritten version, and this
14104 * function will return what it constructs. (Actually the <multi-fold>s
14105 * aren't physically removed from the [abcdefghi], it's just that they are
14106 * ignored in the recursion by means of a flag:
14107 * <RExC_in_multi_char_class>.)
14109 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
14110 * characters, with the corresponding bit set if that character is in the
14111 * list. For characters above this, a range list or swash is used. There
14112 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
14113 * determinable at compile time
14115 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
14116 * to be restarted. This can only happen if ret_invlist is non-NULL.
14119 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
14121 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
14124 IV namedclass = OOB_NAMEDCLASS;
14125 char *rangebegin = NULL;
14126 bool need_class = 0;
14128 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
14129 than just initialized. */
14130 SV* properties = NULL; /* Code points that match \p{} \P{} */
14131 SV* posixes = NULL; /* Code points that match classes like [:word:],
14132 extended beyond the Latin1 range. These have to
14133 be kept separate from other code points for much
14134 of this function because their handling is
14135 different under /i, and for most classes under
14137 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
14138 separate for a while from the non-complemented
14139 versions because of complications with /d
14141 SV* simple_posixes = NULL; /* But under some conditions, the classes can be
14142 treated more simply than the general case,
14143 leading to less compilation and execution
14145 UV element_count = 0; /* Number of distinct elements in the class.
14146 Optimizations may be possible if this is tiny */
14147 AV * multi_char_matches = NULL; /* Code points that fold to more than one
14148 character; used under /i */
14150 char * stop_ptr = RExC_end; /* where to stop parsing */
14151 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
14154 /* Unicode properties are stored in a swash; this holds the current one
14155 * being parsed. If this swash is the only above-latin1 component of the
14156 * character class, an optimization is to pass it directly on to the
14157 * execution engine. Otherwise, it is set to NULL to indicate that there
14158 * are other things in the class that have to be dealt with at execution
14160 SV* swash = NULL; /* Code points that match \p{} \P{} */
14162 /* Set if a component of this character class is user-defined; just passed
14163 * on to the engine */
14164 bool has_user_defined_property = FALSE;
14166 /* inversion list of code points this node matches only when the target
14167 * string is in UTF-8. (Because is under /d) */
14168 SV* depends_list = NULL;
14170 /* Inversion list of code points this node matches regardless of things
14171 * like locale, folding, utf8ness of the target string */
14172 SV* cp_list = NULL;
14174 /* Like cp_list, but code points on this list need to be checked for things
14175 * that fold to/from them under /i */
14176 SV* cp_foldable_list = NULL;
14178 /* Like cp_list, but code points on this list are valid only when the
14179 * runtime locale is UTF-8 */
14180 SV* only_utf8_locale_list = NULL;
14182 /* In a range, if one of the endpoints is non-character-set portable,
14183 * meaning that it hard-codes a code point that may mean a different
14184 * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
14185 * mnemonic '\t' which each mean the same character no matter which
14186 * character set the platform is on. */
14187 unsigned int non_portable_endpoint = 0;
14189 /* Is the range unicode? which means on a platform that isn't 1-1 native
14190 * to Unicode (i.e. non-ASCII), each code point in it should be considered
14191 * to be a Unicode value. */
14192 bool unicode_range = FALSE;
14193 bool invert = FALSE; /* Is this class to be complemented */
14195 bool warn_super = ALWAYS_WARN_SUPER;
14197 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
14198 case we need to change the emitted regop to an EXACT. */
14199 const char * orig_parse = RExC_parse;
14200 const SSize_t orig_size = RExC_size;
14201 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
14202 GET_RE_DEBUG_FLAGS_DECL;
14204 PERL_ARGS_ASSERT_REGCLASS;
14206 PERL_UNUSED_ARG(depth);
14209 DEBUG_PARSE("clas");
14211 /* Assume we are going to generate an ANYOF node. */
14212 ret = reganode(pRExC_state,
14219 RExC_size += ANYOF_SKIP;
14220 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
14223 ANYOF_FLAGS(ret) = 0;
14225 RExC_emit += ANYOF_SKIP;
14226 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
14227 initial_listsv_len = SvCUR(listsv);
14228 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
14232 RExC_parse = regpatws(pRExC_state, RExC_parse,
14233 FALSE /* means don't recognize comments */ );
14236 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
14239 allow_multi_folds = FALSE;
14242 RExC_parse = regpatws(pRExC_state, RExC_parse,
14243 FALSE /* means don't recognize comments */ );
14247 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
14248 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
14249 const char *s = RExC_parse;
14250 const char c = *s++;
14255 while (isWORDCHAR(*s))
14257 if (*s && c == *s && s[1] == ']') {
14258 SAVEFREESV(RExC_rx_sv);
14260 "POSIX syntax [%c %c] belongs inside character classes",
14262 (void)ReREFCNT_inc(RExC_rx_sv);
14266 /* If the caller wants us to just parse a single element, accomplish this
14267 * by faking the loop ending condition */
14268 if (stop_at_1 && RExC_end > RExC_parse) {
14269 stop_ptr = RExC_parse + 1;
14272 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
14273 if (UCHARAT(RExC_parse) == ']')
14274 goto charclassloop;
14277 if (RExC_parse >= stop_ptr) {
14282 RExC_parse = regpatws(pRExC_state, RExC_parse,
14283 FALSE /* means don't recognize comments */ );
14286 if (UCHARAT(RExC_parse) == ']') {
14292 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
14293 save_value = value;
14294 save_prevvalue = prevvalue;
14297 rangebegin = RExC_parse;
14299 non_portable_endpoint = 0;
14302 value = utf8n_to_uvchr((U8*)RExC_parse,
14303 RExC_end - RExC_parse,
14304 &numlen, UTF8_ALLOW_DEFAULT);
14305 RExC_parse += numlen;
14308 value = UCHARAT(RExC_parse++);
14311 && RExC_parse < RExC_end
14312 && POSIXCC(UCHARAT(RExC_parse)))
14314 namedclass = regpposixcc(pRExC_state, value, strict);
14316 else if (value == '\\') {
14317 /* Is a backslash; get the code point of the char after it */
14318 if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
14319 value = utf8n_to_uvchr((U8*)RExC_parse,
14320 RExC_end - RExC_parse,
14321 &numlen, UTF8_ALLOW_DEFAULT);
14322 RExC_parse += numlen;
14325 value = UCHARAT(RExC_parse++);
14327 /* Some compilers cannot handle switching on 64-bit integer
14328 * values, therefore value cannot be an UV. Yes, this will
14329 * be a problem later if we want switch on Unicode.
14330 * A similar issue a little bit later when switching on
14331 * namedclass. --jhi */
14333 /* If the \ is escaping white space when white space is being
14334 * skipped, it means that that white space is wanted literally, and
14335 * is already in 'value'. Otherwise, need to translate the escape
14336 * into what it signifies. */
14337 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
14339 case 'w': namedclass = ANYOF_WORDCHAR; break;
14340 case 'W': namedclass = ANYOF_NWORDCHAR; break;
14341 case 's': namedclass = ANYOF_SPACE; break;
14342 case 'S': namedclass = ANYOF_NSPACE; break;
14343 case 'd': namedclass = ANYOF_DIGIT; break;
14344 case 'D': namedclass = ANYOF_NDIGIT; break;
14345 case 'v': namedclass = ANYOF_VERTWS; break;
14346 case 'V': namedclass = ANYOF_NVERTWS; break;
14347 case 'h': namedclass = ANYOF_HORIZWS; break;
14348 case 'H': namedclass = ANYOF_NHORIZWS; break;
14349 case 'N': /* Handle \N{NAME} in class */
14351 const char * const backslash_N_beg = RExC_parse - 2;
14354 if (! grok_bslash_N(pRExC_state,
14355 NULL, /* No regnode */
14356 &value, /* Yes single value */
14357 &cp_count, /* Multiple code pt count */
14362 if (*flagp & RESTART_UTF8)
14363 FAIL("panic: grok_bslash_N set RESTART_UTF8");
14365 if (cp_count < 0) {
14366 vFAIL("\\N in a character class must be a named character: \\N{...}");
14368 else if (cp_count == 0) {
14370 RExC_parse++; /* Position after the "}" */
14371 vFAIL("Zero length \\N{}");
14374 ckWARNreg(RExC_parse,
14375 "Ignoring zero length \\N{} in character class");
14378 else { /* cp_count > 1 */
14379 if (! RExC_in_multi_char_class) {
14380 if (invert || range || *RExC_parse == '-') {
14383 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
14386 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
14388 break; /* <value> contains the first code
14389 point. Drop out of the switch to
14393 SV * multi_char_N = newSVpvn(backslash_N_beg,
14394 RExC_parse - backslash_N_beg);
14396 = add_multi_match(multi_char_matches,
14401 } /* End of cp_count != 1 */
14403 /* This element should not be processed further in this
14406 value = save_value;
14407 prevvalue = save_prevvalue;
14408 continue; /* Back to top of loop to get next char */
14411 /* Here, is a single code point, and <value> contains it */
14412 unicode_range = TRUE; /* \N{} are Unicode */
14420 /* We will handle any undefined properties ourselves */
14421 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
14422 /* And we actually would prefer to get
14423 * the straight inversion list of the
14424 * swash, since we will be accessing it
14425 * anyway, to save a little time */
14426 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
14428 if (RExC_parse >= RExC_end)
14429 vFAIL2("Empty \\%c{}", (U8)value);
14430 if (*RExC_parse == '{') {
14431 const U8 c = (U8)value;
14432 e = strchr(RExC_parse++, '}');
14434 vFAIL2("Missing right brace on \\%c{}", c);
14435 while (isSPACE(*RExC_parse))
14437 if (e == RExC_parse)
14438 vFAIL2("Empty \\%c{}", c);
14439 n = e - RExC_parse;
14440 while (isSPACE(*(RExC_parse + n - 1)))
14451 if (UCHARAT(RExC_parse) == '^') {
14454 /* toggle. (The rhs xor gets the single bit that
14455 * differs between P and p; the other xor inverts just
14457 value ^= 'P' ^ 'p';
14459 while (isSPACE(*RExC_parse)) {
14464 /* Try to get the definition of the property into
14465 * <invlist>. If /i is in effect, the effective property
14466 * will have its name be <__NAME_i>. The design is
14467 * discussed in commit
14468 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
14469 name = savepv(Perl_form(aTHX_
14471 (FOLD) ? "__" : "",
14477 /* Look up the property name, and get its swash and
14478 * inversion list, if the property is found */
14480 SvREFCNT_dec_NN(swash);
14482 swash = _core_swash_init("utf8", name, &PL_sv_undef,
14485 NULL, /* No inversion list */
14488 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
14489 HV* curpkg = (IN_PERL_COMPILETIME)
14491 : CopSTASH(PL_curcop);
14493 SvREFCNT_dec_NN(swash);
14497 /* Here didn't find it. It could be a user-defined
14498 * property that will be available at run-time. If we
14499 * accept only compile-time properties, is an error;
14500 * otherwise add it to the list for run-time look up */
14502 RExC_parse = e + 1;
14504 "Property '%"UTF8f"' is unknown",
14505 UTF8fARG(UTF, n, name));
14508 /* If the property name doesn't already have a package
14509 * name, add the current one to it so that it can be
14510 * referred to outside it. [perl #121777] */
14511 if (curpkg && ! instr(name, "::")) {
14512 char* pkgname = HvNAME(curpkg);
14513 if (strNE(pkgname, "main")) {
14514 char* full_name = Perl_form(aTHX_
14518 n = strlen(full_name);
14520 name = savepvn(full_name, n);
14523 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
14524 (value == 'p' ? '+' : '!'),
14525 UTF8fARG(UTF, n, name));
14526 has_user_defined_property = TRUE;
14528 /* We don't know yet, so have to assume that the
14529 * property could match something in the Latin1 range,
14530 * hence something that isn't utf8. Note that this
14531 * would cause things in <depends_list> to match
14532 * inappropriately, except that any \p{}, including
14533 * this one forces Unicode semantics, which means there
14534 * is no <depends_list> */
14536 |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
14540 /* Here, did get the swash and its inversion list. If
14541 * the swash is from a user-defined property, then this
14542 * whole character class should be regarded as such */
14543 if (swash_init_flags
14544 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
14546 has_user_defined_property = TRUE;
14549 /* We warn on matching an above-Unicode code point
14550 * if the match would return true, except don't
14551 * warn for \p{All}, which has exactly one element
14553 (_invlist_contains_cp(invlist, 0x110000)
14554 && (! (_invlist_len(invlist) == 1
14555 && *invlist_array(invlist) == 0)))
14561 /* Invert if asking for the complement */
14562 if (value == 'P') {
14563 _invlist_union_complement_2nd(properties,
14567 /* The swash can't be used as-is, because we've
14568 * inverted things; delay removing it to here after
14569 * have copied its invlist above */
14570 SvREFCNT_dec_NN(swash);
14574 _invlist_union(properties, invlist, &properties);
14579 RExC_parse = e + 1;
14580 namedclass = ANYOF_UNIPROP; /* no official name, but it's
14583 /* \p means they want Unicode semantics */
14584 RExC_uni_semantics = 1;
14587 case 'n': value = '\n'; break;
14588 case 'r': value = '\r'; break;
14589 case 't': value = '\t'; break;
14590 case 'f': value = '\f'; break;
14591 case 'b': value = '\b'; break;
14592 case 'e': value = ESC_NATIVE; break;
14593 case 'a': value = '\a'; break;
14595 RExC_parse--; /* function expects to be pointed at the 'o' */
14597 const char* error_msg;
14598 bool valid = grok_bslash_o(&RExC_parse,
14601 PASS2, /* warnings only in
14604 silence_non_portable,
14610 non_portable_endpoint++;
14611 if (IN_ENCODING && value < 0x100) {
14612 goto recode_encoding;
14616 RExC_parse--; /* function expects to be pointed at the 'x' */
14618 const char* error_msg;
14619 bool valid = grok_bslash_x(&RExC_parse,
14622 PASS2, /* Output warnings */
14624 silence_non_portable,
14630 non_portable_endpoint++;
14631 if (IN_ENCODING && value < 0x100)
14632 goto recode_encoding;
14635 value = grok_bslash_c(*RExC_parse++, PASS2);
14636 non_portable_endpoint++;
14638 case '0': case '1': case '2': case '3': case '4':
14639 case '5': case '6': case '7':
14641 /* Take 1-3 octal digits */
14642 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
14643 numlen = (strict) ? 4 : 3;
14644 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
14645 RExC_parse += numlen;
14648 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
14649 vFAIL("Need exactly 3 octal digits");
14651 else if (! SIZE_ONLY /* like \08, \178 */
14653 && RExC_parse < RExC_end
14654 && isDIGIT(*RExC_parse)
14655 && ckWARN(WARN_REGEXP))
14657 SAVEFREESV(RExC_rx_sv);
14658 reg_warn_non_literal_string(
14660 form_short_octal_warning(RExC_parse, numlen));
14661 (void)ReREFCNT_inc(RExC_rx_sv);
14664 non_portable_endpoint++;
14665 if (IN_ENCODING && value < 0x100)
14666 goto recode_encoding;
14670 if (! RExC_override_recoding) {
14671 SV* enc = _get_encoding();
14672 value = reg_recode((const char)(U8)value, &enc);
14675 vFAIL("Invalid escape in the specified encoding");
14678 ckWARNreg(RExC_parse,
14679 "Invalid escape in the specified encoding");
14685 /* Allow \_ to not give an error */
14686 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
14688 vFAIL2("Unrecognized escape \\%c in character class",
14692 SAVEFREESV(RExC_rx_sv);
14693 ckWARN2reg(RExC_parse,
14694 "Unrecognized escape \\%c in character class passed through",
14696 (void)ReREFCNT_inc(RExC_rx_sv);
14700 } /* End of switch on char following backslash */
14701 } /* end of handling backslash escape sequences */
14703 /* Here, we have the current token in 'value' */
14705 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
14708 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
14709 * literal, as is the character that began the false range, i.e.
14710 * the 'a' in the examples */
14713 const int w = (RExC_parse >= rangebegin)
14714 ? RExC_parse - rangebegin
14718 "False [] range \"%"UTF8f"\"",
14719 UTF8fARG(UTF, w, rangebegin));
14722 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
14723 ckWARN2reg(RExC_parse,
14724 "False [] range \"%"UTF8f"\"",
14725 UTF8fARG(UTF, w, rangebegin));
14726 (void)ReREFCNT_inc(RExC_rx_sv);
14727 cp_list = add_cp_to_invlist(cp_list, '-');
14728 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14733 range = 0; /* this was not a true range */
14734 element_count += 2; /* So counts for three values */
14737 classnum = namedclass_to_classnum(namedclass);
14739 if (LOC && namedclass < ANYOF_POSIXL_MAX
14740 #ifndef HAS_ISASCII
14741 && classnum != _CC_ASCII
14744 /* What the Posix classes (like \w, [:space:]) match in locale
14745 * isn't knowable under locale until actual match time. Room
14746 * must be reserved (one time per outer bracketed class) to
14747 * store such classes. The space will contain a bit for each
14748 * named class that is to be matched against. This isn't
14749 * needed for \p{} and pseudo-classes, as they are not affected
14750 * by locale, and hence are dealt with separately */
14751 if (! need_class) {
14754 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14757 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14759 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14760 ANYOF_POSIXL_ZERO(ret);
14763 /* Coverity thinks it is possible for this to be negative; both
14764 * jhi and khw think it's not, but be safer */
14765 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14766 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14768 /* See if it already matches the complement of this POSIX
14770 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14771 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14775 posixl_matches_all = TRUE;
14776 break; /* No need to continue. Since it matches both
14777 e.g., \w and \W, it matches everything, and the
14778 bracketed class can be optimized into qr/./s */
14781 /* Add this class to those that should be checked at runtime */
14782 ANYOF_POSIXL_SET(ret, namedclass);
14784 /* The above-Latin1 characters are not subject to locale rules.
14785 * Just add them, in the second pass, to the
14786 * unconditionally-matched list */
14788 SV* scratch_list = NULL;
14790 /* Get the list of the above-Latin1 code points this
14792 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14793 PL_XPosix_ptrs[classnum],
14795 /* Odd numbers are complements, like
14796 * NDIGIT, NASCII, ... */
14797 namedclass % 2 != 0,
14799 /* Checking if 'cp_list' is NULL first saves an extra
14800 * clone. Its reference count will be decremented at the
14801 * next union, etc, or if this is the only instance, at the
14802 * end of the routine */
14804 cp_list = scratch_list;
14807 _invlist_union(cp_list, scratch_list, &cp_list);
14808 SvREFCNT_dec_NN(scratch_list);
14810 continue; /* Go get next character */
14813 else if (! SIZE_ONLY) {
14815 /* Here, not in pass1 (in that pass we skip calculating the
14816 * contents of this class), and is /l, or is a POSIX class for
14817 * which /l doesn't matter (or is a Unicode property, which is
14818 * skipped here). */
14819 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
14820 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14822 /* Here, should be \h, \H, \v, or \V. None of /d, /i
14823 * nor /l make a difference in what these match,
14824 * therefore we just add what they match to cp_list. */
14825 if (classnum != _CC_VERTSPACE) {
14826 assert( namedclass == ANYOF_HORIZWS
14827 || namedclass == ANYOF_NHORIZWS);
14829 /* It turns out that \h is just a synonym for
14831 classnum = _CC_BLANK;
14834 _invlist_union_maybe_complement_2nd(
14836 PL_XPosix_ptrs[classnum],
14837 namedclass % 2 != 0, /* Complement if odd
14838 (NHORIZWS, NVERTWS)
14843 else if (UNI_SEMANTICS
14844 || classnum == _CC_ASCII
14845 || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
14846 || classnum == _CC_XDIGIT)))
14848 /* We usually have to worry about /d and /a affecting what
14849 * POSIX classes match, with special code needed for /d
14850 * because we won't know until runtime what all matches.
14851 * But there is no extra work needed under /u, and
14852 * [:ascii:] is unaffected by /a and /d; and :digit: and
14853 * :xdigit: don't have runtime differences under /d. So we
14854 * can special case these, and avoid some extra work below,
14855 * and at runtime. */
14856 _invlist_union_maybe_complement_2nd(
14858 PL_XPosix_ptrs[classnum],
14859 namedclass % 2 != 0,
14862 else { /* Garden variety class. If is NUPPER, NALPHA, ...
14863 complement and use nposixes */
14864 SV** posixes_ptr = namedclass % 2 == 0
14867 _invlist_union_maybe_complement_2nd(
14869 PL_XPosix_ptrs[classnum],
14870 namedclass % 2 != 0,
14874 } /* end of namedclass \blah */
14877 RExC_parse = regpatws(pRExC_state, RExC_parse,
14878 FALSE /* means don't recognize comments */ );
14881 /* If 'range' is set, 'value' is the ending of a range--check its
14882 * validity. (If value isn't a single code point in the case of a
14883 * range, we should have figured that out above in the code that
14884 * catches false ranges). Later, we will handle each individual code
14885 * point in the range. If 'range' isn't set, this could be the
14886 * beginning of a range, so check for that by looking ahead to see if
14887 * the next real character to be processed is the range indicator--the
14892 /* For unicode ranges, we have to test that the Unicode as opposed
14893 * to the native values are not decreasing. (Above 255, there is
14894 * no difference between native and Unicode) */
14895 if (unicode_range && prevvalue < 255 && value < 255) {
14896 if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
14897 goto backwards_range;
14902 if (prevvalue > value) /* b-a */ {
14907 w = RExC_parse - rangebegin;
14909 "Invalid [] range \"%"UTF8f"\"",
14910 UTF8fARG(UTF, w, rangebegin));
14911 NOT_REACHED; /* NOTREACHED */
14915 prevvalue = value; /* save the beginning of the potential range */
14916 if (! stop_at_1 /* Can't be a range if parsing just one thing */
14917 && *RExC_parse == '-')
14919 char* next_char_ptr = RExC_parse + 1;
14920 if (skip_white) { /* Get the next real char after the '-' */
14921 next_char_ptr = regpatws(pRExC_state,
14923 FALSE); /* means don't recognize
14927 /* If the '-' is at the end of the class (just before the ']',
14928 * it is a literal minus; otherwise it is a range */
14929 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14930 RExC_parse = next_char_ptr;
14932 /* a bad range like \w-, [:word:]- ? */
14933 if (namedclass > OOB_NAMEDCLASS) {
14934 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14935 const int w = RExC_parse >= rangebegin
14936 ? RExC_parse - rangebegin
14939 vFAIL4("False [] range \"%*.*s\"",
14944 "False [] range \"%*.*s\"",
14949 cp_list = add_cp_to_invlist(cp_list, '-');
14953 range = 1; /* yeah, it's a range! */
14954 continue; /* but do it the next time */
14959 if (namedclass > OOB_NAMEDCLASS) {
14963 /* Here, we have a single value this time through the loop, and
14964 * <prevvalue> is the beginning of the range, if any; or <value> if
14967 /* non-Latin1 code point implies unicode semantics. Must be set in
14968 * pass1 so is there for the whole of pass 2 */
14970 RExC_uni_semantics = 1;
14973 /* Ready to process either the single value, or the completed range.
14974 * For single-valued non-inverted ranges, we consider the possibility
14975 * of multi-char folds. (We made a conscious decision to not do this
14976 * for the other cases because it can often lead to non-intuitive
14977 * results. For example, you have the peculiar case that:
14978 * "s s" =~ /^[^\xDF]+$/i => Y
14979 * "ss" =~ /^[^\xDF]+$/i => N
14981 * See [perl #89750] */
14982 if (FOLD && allow_multi_folds && value == prevvalue) {
14983 if (value == LATIN_SMALL_LETTER_SHARP_S
14984 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14987 /* Here <value> is indeed a multi-char fold. Get what it is */
14989 U8 foldbuf[UTF8_MAXBYTES_CASE];
14992 UV folded = _to_uni_fold_flags(
14996 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14997 ? FOLD_FLAGS_NOMIX_ASCII
15001 /* Here, <folded> should be the first character of the
15002 * multi-char fold of <value>, with <foldbuf> containing the
15003 * whole thing. But, if this fold is not allowed (because of
15004 * the flags), <fold> will be the same as <value>, and should
15005 * be processed like any other character, so skip the special
15007 if (folded != value) {
15009 /* Skip if we are recursed, currently parsing the class
15010 * again. Otherwise add this character to the list of
15011 * multi-char folds. */
15012 if (! RExC_in_multi_char_class) {
15013 STRLEN cp_count = utf8_length(foldbuf,
15014 foldbuf + foldlen);
15015 SV* multi_fold = sv_2mortal(newSVpvs(""));
15017 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
15020 = add_multi_match(multi_char_matches,
15026 /* This element should not be processed further in this
15029 value = save_value;
15030 prevvalue = save_prevvalue;
15036 if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
15039 /* If the range starts above 255, everything is portable and
15040 * likely to be so for any forseeable character set, so don't
15042 if (unicode_range && non_portable_endpoint && prevvalue < 256) {
15043 vWARN(RExC_parse, "Both or neither range ends should be Unicode");
15045 else if (prevvalue != value) {
15047 /* Under strict, ranges that stop and/or end in an ASCII
15048 * printable should have each end point be a portable value
15049 * for it (preferably like 'A', but we don't warn if it is
15050 * a (portable) Unicode name or code point), and the range
15051 * must be be all digits or all letters of the same case.
15052 * Otherwise, the range is non-portable and unclear as to
15053 * what it contains */
15054 if ((isPRINT_A(prevvalue) || isPRINT_A(value))
15055 && (non_portable_endpoint
15056 || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
15057 || (isLOWER_A(prevvalue) && isLOWER_A(value))
15058 || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
15060 vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
15062 else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
15064 /* But the nature of Unicode and languages mean we
15065 * can't do the same checks for above-ASCII ranges,
15066 * except in the case of digit ones. These should
15067 * contain only digits from the same group of 10. The
15068 * ASCII case is handled just above. 0x660 is the
15069 * first digit character beyond ASCII. Hence here, the
15070 * range could be a range of digits. Find out. */
15071 IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15073 IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
15076 /* If the range start and final points are in the same
15077 * inversion list element, it means that either both
15078 * are not digits, or both are digits in a consecutive
15079 * sequence of digits. (So far, Unicode has kept all
15080 * such sequences as distinct groups of 10, but assert
15081 * to make sure). If the end points are not in the
15082 * same element, neither should be a digit. */
15083 if (index_start == index_final) {
15084 assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
15085 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15086 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15088 /* But actually Unicode did have one group of 11
15089 * 'digits' in 5.2, so in case we are operating
15090 * on that version, let that pass */
15091 || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
15092 - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15094 && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
15098 else if ((index_start >= 0
15099 && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
15100 || (index_final >= 0
15101 && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
15103 vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
15108 if ((! range || prevvalue == value) && non_portable_endpoint) {
15109 if (isPRINT_A(value)) {
15112 if (isBACKSLASHED_PUNCT(value)) {
15113 literal[d++] = '\\';
15115 literal[d++] = (char) value;
15116 literal[d++] = '\0';
15119 "\"%.*s\" is more clearly written simply as \"%s\"",
15120 (int) (RExC_parse - rangebegin),
15125 else if isMNEMONIC_CNTRL(value) {
15127 "\"%.*s\" is more clearly written simply as \"%s\"",
15128 (int) (RExC_parse - rangebegin),
15130 cntrl_to_mnemonic((char) value)
15136 /* Deal with this element of the class */
15140 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15143 /* On non-ASCII platforms, for ranges that span all of 0..255, and
15144 * ones that don't require special handling, we can just add the
15145 * range like we do for ASCII platforms */
15146 if ((UNLIKELY(prevvalue == 0) && value >= 255)
15147 || ! (prevvalue < 256
15149 || (! non_portable_endpoint
15150 && ((isLOWER_A(prevvalue) && isLOWER_A(value))
15151 || (isUPPER_A(prevvalue)
15152 && isUPPER_A(value)))))))
15154 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15158 /* Here, requires special handling. This can be because it is
15159 * a range whose code points are considered to be Unicode, and
15160 * so must be individually translated into native, or because
15161 * its a subrange of 'A-Z' or 'a-z' which each aren't
15162 * contiguous in EBCDIC, but we have defined them to include
15163 * only the "expected" upper or lower case ASCII alphabetics.
15164 * Subranges above 255 are the same in native and Unicode, so
15165 * can be added as a range */
15166 U8 start = NATIVE_TO_LATIN1(prevvalue);
15168 U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
15169 for (j = start; j <= end; j++) {
15170 cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
15173 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
15180 range = 0; /* this range (if it was one) is done now */
15181 } /* End of loop through all the text within the brackets */
15183 /* If anything in the class expands to more than one character, we have to
15184 * deal with them by building up a substitute parse string, and recursively
15185 * calling reg() on it, instead of proceeding */
15186 if (multi_char_matches) {
15187 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
15190 char *save_end = RExC_end;
15191 char *save_parse = RExC_parse;
15192 bool first_time = TRUE; /* First multi-char occurrence doesn't get
15197 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
15198 because too confusing */
15200 sv_catpv(substitute_parse, "(?:");
15204 /* Look at the longest folds first */
15205 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
15207 if (av_exists(multi_char_matches, cp_count)) {
15208 AV** this_array_ptr;
15211 this_array_ptr = (AV**) av_fetch(multi_char_matches,
15213 while ((this_sequence = av_pop(*this_array_ptr)) !=
15216 if (! first_time) {
15217 sv_catpv(substitute_parse, "|");
15219 first_time = FALSE;
15221 sv_catpv(substitute_parse, SvPVX(this_sequence));
15226 /* If the character class contains anything else besides these
15227 * multi-character folds, have to include it in recursive parsing */
15228 if (element_count) {
15229 sv_catpv(substitute_parse, "|[");
15230 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
15231 sv_catpv(substitute_parse, "]");
15234 sv_catpv(substitute_parse, ")");
15237 /* This is a way to get the parse to skip forward a whole named
15238 * sequence instead of matching the 2nd character when it fails the
15240 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
15244 RExC_parse = SvPV(substitute_parse, len);
15245 RExC_end = RExC_parse + len;
15246 RExC_in_multi_char_class = 1;
15247 RExC_override_recoding = 1;
15248 RExC_emit = (regnode *)orig_emit;
15250 ret = reg(pRExC_state, 1, ®_flags, depth+1);
15252 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
15254 RExC_parse = save_parse;
15255 RExC_end = save_end;
15256 RExC_in_multi_char_class = 0;
15257 RExC_override_recoding = 0;
15258 SvREFCNT_dec_NN(multi_char_matches);
15262 /* Here, we've gone through the entire class and dealt with multi-char
15263 * folds. We are now in a position that we can do some checks to see if we
15264 * can optimize this ANYOF node into a simpler one, even in Pass 1.
15265 * Currently we only do two checks:
15266 * 1) is in the unlikely event that the user has specified both, eg. \w and
15267 * \W under /l, then the class matches everything. (This optimization
15268 * is done only to make the optimizer code run later work.)
15269 * 2) if the character class contains only a single element (including a
15270 * single range), we see if there is an equivalent node for it.
15271 * Other checks are possible */
15272 if (! ret_invlist /* Can't optimize if returning the constructed
15274 && (UNLIKELY(posixl_matches_all) || element_count == 1))
15279 if (UNLIKELY(posixl_matches_all)) {
15282 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
15283 \w or [:digit:] or \p{foo}
15286 /* All named classes are mapped into POSIXish nodes, with its FLAG
15287 * argument giving which class it is */
15288 switch ((I32)namedclass) {
15289 case ANYOF_UNIPROP:
15292 /* These don't depend on the charset modifiers. They always
15293 * match under /u rules */
15294 case ANYOF_NHORIZWS:
15295 case ANYOF_HORIZWS:
15296 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
15299 case ANYOF_NVERTWS:
15304 /* The actual POSIXish node for all the rest depends on the
15305 * charset modifier. The ones in the first set depend only on
15306 * ASCII or, if available on this platform, also locale */
15310 op = (LOC) ? POSIXL : POSIXA;
15316 /* The following don't have any matches in the upper Latin1
15317 * range, hence /d is equivalent to /u for them. Making it /u
15318 * saves some branches at runtime */
15322 case ANYOF_NXDIGIT:
15323 if (! DEPENDS_SEMANTICS) {
15324 goto treat_as_default;
15330 /* The following change to CASED under /i */
15336 namedclass = ANYOF_CASED + (namedclass % 2);
15340 /* The rest have more possibilities depending on the charset.
15341 * We take advantage of the enum ordering of the charset
15342 * modifiers to get the exact node type, */
15345 op = POSIXD + get_regex_charset(RExC_flags);
15346 if (op > POSIXA) { /* /aa is same as /a */
15351 /* The odd numbered ones are the complements of the
15352 * next-lower even number one */
15353 if (namedclass % 2 == 1) {
15357 arg = namedclass_to_classnum(namedclass);
15361 else if (value == prevvalue) {
15363 /* Here, the class consists of just a single code point */
15366 if (! LOC && value == '\n') {
15367 op = REG_ANY; /* Optimize [^\n] */
15368 *flagp |= HASWIDTH|SIMPLE;
15372 else if (value < 256 || UTF) {
15374 /* Optimize a single value into an EXACTish node, but not if it
15375 * would require converting the pattern to UTF-8. */
15376 op = compute_EXACTish(pRExC_state);
15378 } /* Otherwise is a range */
15379 else if (! LOC) { /* locale could vary these */
15380 if (prevvalue == '0') {
15381 if (value == '9') {
15386 else if (! FOLD || ASCII_FOLD_RESTRICTED) {
15387 /* We can optimize A-Z or a-z, but not if they could match
15388 * something like the KELVIN SIGN under /i. */
15389 if (prevvalue == 'A') {
15392 && ! non_portable_endpoint
15395 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
15399 else if (prevvalue == 'a') {
15402 && ! non_portable_endpoint
15405 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
15412 /* Here, we have changed <op> away from its initial value iff we found
15413 * an optimization */
15416 /* Throw away this ANYOF regnode, and emit the calculated one,
15417 * which should correspond to the beginning, not current, state of
15419 const char * cur_parse = RExC_parse;
15420 RExC_parse = (char *)orig_parse;
15424 /* To get locale nodes to not use the full ANYOF size would
15425 * require moving the code above that writes the portions
15426 * of it that aren't in other nodes to after this point.
15427 * e.g. ANYOF_POSIXL_SET */
15428 RExC_size = orig_size;
15432 RExC_emit = (regnode *)orig_emit;
15433 if (PL_regkind[op] == POSIXD) {
15434 if (op == POSIXL) {
15435 RExC_contains_locale = 1;
15438 op += NPOSIXD - POSIXD;
15443 ret = reg_node(pRExC_state, op);
15445 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
15449 *flagp |= HASWIDTH|SIMPLE;
15451 else if (PL_regkind[op] == EXACT) {
15452 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15453 TRUE /* downgradable to EXACT */
15457 RExC_parse = (char *) cur_parse;
15459 SvREFCNT_dec(posixes);
15460 SvREFCNT_dec(nposixes);
15461 SvREFCNT_dec(simple_posixes);
15462 SvREFCNT_dec(cp_list);
15463 SvREFCNT_dec(cp_foldable_list);
15470 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
15472 /* If folding, we calculate all characters that could fold to or from the
15473 * ones already on the list */
15474 if (cp_foldable_list) {
15476 UV start, end; /* End points of code point ranges */
15478 SV* fold_intersection = NULL;
15481 /* Our calculated list will be for Unicode rules. For locale
15482 * matching, we have to keep a separate list that is consulted at
15483 * runtime only when the locale indicates Unicode rules. For
15484 * non-locale, we just use to the general list */
15486 use_list = &only_utf8_locale_list;
15489 use_list = &cp_list;
15492 /* Only the characters in this class that participate in folds need
15493 * be checked. Get the intersection of this class and all the
15494 * possible characters that are foldable. This can quickly narrow
15495 * down a large class */
15496 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
15497 &fold_intersection);
15499 /* The folds for all the Latin1 characters are hard-coded into this
15500 * program, but we have to go out to disk to get the others. */
15501 if (invlist_highest(cp_foldable_list) >= 256) {
15503 /* This is a hash that for a particular fold gives all
15504 * characters that are involved in it */
15505 if (! PL_utf8_foldclosures) {
15506 _load_PL_utf8_foldclosures();
15510 /* Now look at the foldable characters in this class individually */
15511 invlist_iterinit(fold_intersection);
15512 while (invlist_iternext(fold_intersection, &start, &end)) {
15515 /* Look at every character in the range */
15516 for (j = start; j <= end; j++) {
15517 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
15523 if (IS_IN_SOME_FOLD_L1(j)) {
15525 /* ASCII is always matched; non-ASCII is matched
15526 * only under Unicode rules (which could happen
15527 * under /l if the locale is a UTF-8 one */
15528 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
15529 *use_list = add_cp_to_invlist(*use_list,
15530 PL_fold_latin1[j]);
15534 add_cp_to_invlist(depends_list,
15535 PL_fold_latin1[j]);
15539 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
15540 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
15542 add_above_Latin1_folds(pRExC_state,
15549 /* Here is an above Latin1 character. We don't have the
15550 * rules hard-coded for it. First, get its fold. This is
15551 * the simple fold, as the multi-character folds have been
15552 * handled earlier and separated out */
15553 _to_uni_fold_flags(j, foldbuf, &foldlen,
15554 (ASCII_FOLD_RESTRICTED)
15555 ? FOLD_FLAGS_NOMIX_ASCII
15558 /* Single character fold of above Latin1. Add everything in
15559 * its fold closure to the list that this node should match.
15560 * The fold closures data structure is a hash with the keys
15561 * being the UTF-8 of every character that is folded to, like
15562 * 'k', and the values each an array of all code points that
15563 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
15564 * Multi-character folds are not included */
15565 if ((listp = hv_fetch(PL_utf8_foldclosures,
15566 (char *) foldbuf, foldlen, FALSE)))
15568 AV* list = (AV*) *listp;
15570 for (k = 0; k <= av_tindex(list); k++) {
15571 SV** c_p = av_fetch(list, k, FALSE);
15577 /* /aa doesn't allow folds between ASCII and non- */
15578 if ((ASCII_FOLD_RESTRICTED
15579 && (isASCII(c) != isASCII(j))))
15584 /* Folds under /l which cross the 255/256 boundary
15585 * are added to a separate list. (These are valid
15586 * only when the locale is UTF-8.) */
15587 if (c < 256 && LOC) {
15588 *use_list = add_cp_to_invlist(*use_list, c);
15592 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
15594 cp_list = add_cp_to_invlist(cp_list, c);
15597 /* Similarly folds involving non-ascii Latin1
15598 * characters under /d are added to their list */
15599 depends_list = add_cp_to_invlist(depends_list,
15606 SvREFCNT_dec_NN(fold_intersection);
15609 /* Now that we have finished adding all the folds, there is no reason
15610 * to keep the foldable list separate */
15611 _invlist_union(cp_list, cp_foldable_list, &cp_list);
15612 SvREFCNT_dec_NN(cp_foldable_list);
15615 /* And combine the result (if any) with any inversion list from posix
15616 * classes. The lists are kept separate up to now because we don't want to
15617 * fold the classes (folding of those is automatically handled by the swash
15618 * fetching code) */
15619 if (simple_posixes) {
15620 _invlist_union(cp_list, simple_posixes, &cp_list);
15621 SvREFCNT_dec_NN(simple_posixes);
15623 if (posixes || nposixes) {
15624 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
15625 /* Under /a and /aa, nothing above ASCII matches these */
15626 _invlist_intersection(posixes,
15627 PL_XPosix_ptrs[_CC_ASCII],
15631 if (DEPENDS_SEMANTICS) {
15632 /* Under /d, everything in the upper half of the Latin1 range
15633 * matches these complements */
15634 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
15636 else if (AT_LEAST_ASCII_RESTRICTED) {
15637 /* Under /a and /aa, everything above ASCII matches these
15639 _invlist_union_complement_2nd(nposixes,
15640 PL_XPosix_ptrs[_CC_ASCII],
15644 _invlist_union(posixes, nposixes, &posixes);
15645 SvREFCNT_dec_NN(nposixes);
15648 posixes = nposixes;
15651 if (! DEPENDS_SEMANTICS) {
15653 _invlist_union(cp_list, posixes, &cp_list);
15654 SvREFCNT_dec_NN(posixes);
15661 /* Under /d, we put into a separate list the Latin1 things that
15662 * match only when the target string is utf8 */
15663 SV* nonascii_but_latin1_properties = NULL;
15664 _invlist_intersection(posixes, PL_UpperLatin1,
15665 &nonascii_but_latin1_properties);
15666 _invlist_subtract(posixes, nonascii_but_latin1_properties,
15669 _invlist_union(cp_list, posixes, &cp_list);
15670 SvREFCNT_dec_NN(posixes);
15676 if (depends_list) {
15677 _invlist_union(depends_list, nonascii_but_latin1_properties,
15679 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
15682 depends_list = nonascii_but_latin1_properties;
15687 /* And combine the result (if any) with any inversion list from properties.
15688 * The lists are kept separate up to now so that we can distinguish the two
15689 * in regards to matching above-Unicode. A run-time warning is generated
15690 * if a Unicode property is matched against a non-Unicode code point. But,
15691 * we allow user-defined properties to match anything, without any warning,
15692 * and we also suppress the warning if there is a portion of the character
15693 * class that isn't a Unicode property, and which matches above Unicode, \W
15694 * or [\x{110000}] for example.
15695 * (Note that in this case, unlike the Posix one above, there is no
15696 * <depends_list>, because having a Unicode property forces Unicode
15701 /* If it matters to the final outcome, see if a non-property
15702 * component of the class matches above Unicode. If so, the
15703 * warning gets suppressed. This is true even if just a single
15704 * such code point is specified, as though not strictly correct if
15705 * another such code point is matched against, the fact that they
15706 * are using above-Unicode code points indicates they should know
15707 * the issues involved */
15709 warn_super = ! (invert
15710 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
15713 _invlist_union(properties, cp_list, &cp_list);
15714 SvREFCNT_dec_NN(properties);
15717 cp_list = properties;
15721 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
15725 /* Here, we have calculated what code points should be in the character
15728 * Now we can see about various optimizations. Fold calculation (which we
15729 * did above) needs to take place before inversion. Otherwise /[^k]/i
15730 * would invert to include K, which under /i would match k, which it
15731 * shouldn't. Therefore we can't invert folded locale now, as it won't be
15732 * folded until runtime */
15734 /* If we didn't do folding, it's because some information isn't available
15735 * until runtime; set the run-time fold flag for these. (We don't have to
15736 * worry about properties folding, as that is taken care of by the swash
15737 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
15738 * locales, or the class matches at least one 0-255 range code point */
15740 if (only_utf8_locale_list) {
15741 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15743 else if (cp_list) { /* Look to see if there a 0-255 code point is in
15746 invlist_iterinit(cp_list);
15747 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
15748 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
15750 invlist_iterfinish(cp_list);
15754 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
15755 * at compile time. Besides not inverting folded locale now, we can't
15756 * invert if there are things such as \w, which aren't known until runtime
15760 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15762 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15764 _invlist_invert(cp_list);
15766 /* Any swash can't be used as-is, because we've inverted things */
15768 SvREFCNT_dec_NN(swash);
15772 /* Clear the invert flag since have just done it here */
15779 *ret_invlist = cp_list;
15780 SvREFCNT_dec(swash);
15782 /* Discard the generated node */
15784 RExC_size = orig_size;
15787 RExC_emit = orig_emit;
15792 /* Some character classes are equivalent to other nodes. Such nodes take
15793 * up less room and generally fewer operations to execute than ANYOF nodes.
15794 * Above, we checked for and optimized into some such equivalents for
15795 * certain common classes that are easy to test. Getting to this point in
15796 * the code means that the class didn't get optimized there. Since this
15797 * code is only executed in Pass 2, it is too late to save space--it has
15798 * been allocated in Pass 1, and currently isn't given back. But turning
15799 * things into an EXACTish node can allow the optimizer to join it to any
15800 * adjacent such nodes. And if the class is equivalent to things like /./,
15801 * expensive run-time swashes can be avoided. Now that we have more
15802 * complete information, we can find things necessarily missed by the
15803 * earlier code. I (khw) am not sure how much to look for here. It would
15804 * be easy, but perhaps too slow, to check any candidates against all the
15805 * node types they could possibly match using _invlistEQ(). */
15810 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
15811 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15813 /* We don't optimize if we are supposed to make sure all non-Unicode
15814 * code points raise a warning, as only ANYOF nodes have this check.
15816 && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
15819 U8 op = END; /* The optimzation node-type */
15820 const char * cur_parse= RExC_parse;
15822 invlist_iterinit(cp_list);
15823 if (! invlist_iternext(cp_list, &start, &end)) {
15825 /* Here, the list is empty. This happens, for example, when a
15826 * Unicode property is the only thing in the character class, and
15827 * it doesn't match anything. (perluniprops.pod notes such
15830 *flagp |= HASWIDTH|SIMPLE;
15832 else if (start == end) { /* The range is a single code point */
15833 if (! invlist_iternext(cp_list, &start, &end)
15835 /* Don't do this optimization if it would require changing
15836 * the pattern to UTF-8 */
15837 && (start < 256 || UTF))
15839 /* Here, the list contains a single code point. Can optimize
15840 * into an EXACTish node */
15851 /* A locale node under folding with one code point can be
15852 * an EXACTFL, as its fold won't be calculated until
15858 /* Here, we are generally folding, but there is only one
15859 * code point to match. If we have to, we use an EXACT
15860 * node, but it would be better for joining with adjacent
15861 * nodes in the optimization pass if we used the same
15862 * EXACTFish node that any such are likely to be. We can
15863 * do this iff the code point doesn't participate in any
15864 * folds. For example, an EXACTF of a colon is the same as
15865 * an EXACT one, since nothing folds to or from a colon. */
15867 if (IS_IN_SOME_FOLD_L1(value)) {
15872 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
15877 /* If we haven't found the node type, above, it means we
15878 * can use the prevailing one */
15880 op = compute_EXACTish(pRExC_state);
15885 else if (start == 0) {
15886 if (end == UV_MAX) {
15888 *flagp |= HASWIDTH|SIMPLE;
15891 else if (end == '\n' - 1
15892 && invlist_iternext(cp_list, &start, &end)
15893 && start == '\n' + 1 && end == UV_MAX)
15896 *flagp |= HASWIDTH|SIMPLE;
15900 invlist_iterfinish(cp_list);
15903 RExC_parse = (char *)orig_parse;
15904 RExC_emit = (regnode *)orig_emit;
15906 ret = reg_node(pRExC_state, op);
15908 RExC_parse = (char *)cur_parse;
15910 if (PL_regkind[op] == EXACT) {
15911 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15912 TRUE /* downgradable to EXACT */
15916 SvREFCNT_dec_NN(cp_list);
15921 /* Here, <cp_list> contains all the code points we can determine at
15922 * compile time that match under all conditions. Go through it, and
15923 * for things that belong in the bitmap, put them there, and delete from
15924 * <cp_list>. While we are at it, see if everything above 255 is in the
15925 * list, and if so, set a flag to speed up execution */
15927 populate_ANYOF_from_invlist(ret, &cp_list);
15930 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15933 /* Here, the bitmap has been populated with all the Latin1 code points that
15934 * always match. Can now add to the overall list those that match only
15935 * when the target string is UTF-8 (<depends_list>). */
15936 if (depends_list) {
15938 _invlist_union(cp_list, depends_list, &cp_list);
15939 SvREFCNT_dec_NN(depends_list);
15942 cp_list = depends_list;
15944 ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15947 /* If there is a swash and more than one element, we can't use the swash in
15948 * the optimization below. */
15949 if (swash && element_count > 1) {
15950 SvREFCNT_dec_NN(swash);
15954 /* Note that the optimization of using 'swash' if it is the only thing in
15955 * the class doesn't have us change swash at all, so it can include things
15956 * that are also in the bitmap; otherwise we have purposely deleted that
15957 * duplicate information */
15958 set_ANYOF_arg(pRExC_state, ret, cp_list,
15959 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15961 only_utf8_locale_list,
15962 swash, has_user_defined_property);
15964 *flagp |= HASWIDTH|SIMPLE;
15966 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15967 RExC_contains_locale = 1;
15973 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15976 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15977 regnode* const node,
15979 SV* const runtime_defns,
15980 SV* const only_utf8_locale_list,
15982 const bool has_user_defined_property)
15984 /* Sets the arg field of an ANYOF-type node 'node', using information about
15985 * the node passed-in. If there is nothing outside the node's bitmap, the
15986 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
15987 * the count returned by add_data(), having allocated and stored an array,
15988 * av, that that count references, as follows:
15989 * av[0] stores the character class description in its textual form.
15990 * This is used later (regexec.c:Perl_regclass_swash()) to
15991 * initialize the appropriate swash, and is also useful for dumping
15992 * the regnode. This is set to &PL_sv_undef if the textual
15993 * description is not needed at run-time (as happens if the other
15994 * elements completely define the class)
15995 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15996 * computed from av[0]. But if no further computation need be done,
15997 * the swash is stored here now (and av[0] is &PL_sv_undef).
15998 * av[2] stores the inversion list of code points that match only if the
15999 * current locale is UTF-8
16000 * av[3] stores the cp_list inversion list for use in addition or instead
16001 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
16002 * (Otherwise everything needed is already in av[0] and av[1])
16003 * av[4] is set if any component of the class is from a user-defined
16004 * property; used only if av[3] exists */
16008 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
16010 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
16011 assert(! (ANYOF_FLAGS(node)
16012 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16013 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
16014 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
16017 AV * const av = newAV();
16020 assert(ANYOF_FLAGS(node)
16021 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16022 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
16024 av_store(av, 0, (runtime_defns)
16025 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
16028 av_store(av, 1, swash);
16029 SvREFCNT_dec_NN(cp_list);
16032 av_store(av, 1, &PL_sv_undef);
16034 av_store(av, 3, cp_list);
16035 av_store(av, 4, newSVuv(has_user_defined_property));
16039 if (only_utf8_locale_list) {
16040 av_store(av, 2, only_utf8_locale_list);
16043 av_store(av, 2, &PL_sv_undef);
16046 rv = newRV_noinc(MUTABLE_SV(av));
16047 n = add_data(pRExC_state, STR_WITH_LEN("s"));
16048 RExC_rxi->data->data[n] = (void*)rv;
16053 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
16055 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
16056 const regnode* node,
16059 SV** only_utf8_locale_ptr,
16063 /* For internal core use only.
16064 * Returns the swash for the input 'node' in the regex 'prog'.
16065 * If <doinit> is 'true', will attempt to create the swash if not already
16067 * If <listsvp> is non-null, will return the printable contents of the
16068 * swash. This can be used to get debugging information even before the
16069 * swash exists, by calling this function with 'doinit' set to false, in
16070 * which case the components that will be used to eventually create the
16071 * swash are returned (in a printable form).
16072 * If <exclude_list> is not NULL, it is an inversion list of things to
16073 * exclude from what's returned in <listsvp>.
16074 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
16075 * that, in spite of this function's name, the swash it returns may include
16076 * the bitmap data as well */
16079 SV *si = NULL; /* Input swash initialization string */
16080 SV* invlist = NULL;
16082 RXi_GET_DECL(prog,progi);
16083 const struct reg_data * const data = prog ? progi->data : NULL;
16085 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
16087 assert(ANYOF_FLAGS(node)
16088 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16089 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
16091 if (data && data->count) {
16092 const U32 n = ARG(node);
16094 if (data->what[n] == 's') {
16095 SV * const rv = MUTABLE_SV(data->data[n]);
16096 AV * const av = MUTABLE_AV(SvRV(rv));
16097 SV **const ary = AvARRAY(av);
16098 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
16100 si = *ary; /* ary[0] = the string to initialize the swash with */
16102 /* Elements 3 and 4 are either both present or both absent. [3] is
16103 * any inversion list generated at compile time; [4] indicates if
16104 * that inversion list has any user-defined properties in it. */
16105 if (av_tindex(av) >= 2) {
16106 if (only_utf8_locale_ptr
16108 && ary[2] != &PL_sv_undef)
16110 *only_utf8_locale_ptr = ary[2];
16113 assert(only_utf8_locale_ptr);
16114 *only_utf8_locale_ptr = NULL;
16117 if (av_tindex(av) >= 3) {
16119 if (SvUV(ary[4])) {
16120 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
16128 /* Element [1] is reserved for the set-up swash. If already there,
16129 * return it; if not, create it and store it there */
16130 if (ary[1] && SvROK(ary[1])) {
16133 else if (doinit && ((si && si != &PL_sv_undef)
16134 || (invlist && invlist != &PL_sv_undef))) {
16136 sw = _core_swash_init("utf8", /* the utf8 package */
16140 0, /* not from tr/// */
16142 &swash_init_flags);
16143 (void)av_store(av, 1, sw);
16148 /* If requested, return a printable version of what this swash matches */
16150 SV* matches_string = newSVpvs("");
16152 /* The swash should be used, if possible, to get the data, as it
16153 * contains the resolved data. But this function can be called at
16154 * compile-time, before everything gets resolved, in which case we
16155 * return the currently best available information, which is the string
16156 * that will eventually be used to do that resolving, 'si' */
16157 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
16158 && (si && si != &PL_sv_undef))
16160 sv_catsv(matches_string, si);
16163 /* Add the inversion list to whatever we have. This may have come from
16164 * the swash, or from an input parameter */
16166 if (exclude_list) {
16167 SV* clone = invlist_clone(invlist);
16168 _invlist_subtract(clone, exclude_list, &clone);
16169 sv_catsv(matches_string, _invlist_contents(clone));
16170 SvREFCNT_dec_NN(clone);
16173 sv_catsv(matches_string, _invlist_contents(invlist));
16176 *listsvp = matches_string;
16181 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
16183 /* reg_skipcomment()
16185 Absorbs an /x style # comment from the input stream,
16186 returning a pointer to the first character beyond the comment, or if the
16187 comment terminates the pattern without anything following it, this returns
16188 one past the final character of the pattern (in other words, RExC_end) and
16189 sets the REG_RUN_ON_COMMENT_SEEN flag.
16191 Note it's the callers responsibility to ensure that we are
16192 actually in /x mode
16196 PERL_STATIC_INLINE char*
16197 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
16199 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
16203 while (p < RExC_end) {
16204 if (*(++p) == '\n') {
16209 /* we ran off the end of the pattern without ending the comment, so we have
16210 * to add an \n when wrapping */
16211 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
16217 Advances the parse position, and optionally absorbs
16218 "whitespace" from the inputstream.
16220 Without /x "whitespace" means (?#...) style comments only,
16221 with /x this means (?#...) and # comments and whitespace proper.
16223 Returns the RExC_parse point from BEFORE the scan occurs.
16225 This is the /x friendly way of saying RExC_parse++.
16229 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
16231 char* const retval = RExC_parse++;
16233 PERL_ARGS_ASSERT_NEXTCHAR;
16236 if (RExC_end - RExC_parse >= 3
16237 && *RExC_parse == '('
16238 && RExC_parse[1] == '?'
16239 && RExC_parse[2] == '#')
16241 while (*RExC_parse != ')') {
16242 if (RExC_parse == RExC_end)
16243 FAIL("Sequence (?#... not terminated");
16249 if (RExC_flags & RXf_PMf_EXTENDED) {
16250 char * p = regpatws(pRExC_state, RExC_parse,
16251 TRUE); /* means recognize comments */
16252 if (p != RExC_parse) {
16262 S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
16264 /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra
16265 * space. In pass1, it aligns and increments RExC_size; in pass2,
16268 regnode * const ret = RExC_emit;
16269 GET_RE_DEBUG_FLAGS_DECL;
16271 PERL_ARGS_ASSERT_REGNODE_GUTS;
16273 assert(extra_size >= regarglen[op]);
16276 SIZE_ALIGN(RExC_size);
16277 RExC_size += 1 + extra_size;
16280 if (RExC_emit >= RExC_emit_bound)
16281 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
16282 op, (void*)RExC_emit, (void*)RExC_emit_bound);
16284 NODE_ALIGN_FILL(ret);
16285 #ifndef RE_TRACK_PATTERN_OFFSETS
16286 PERL_UNUSED_ARG(name);
16288 if (RExC_offsets) { /* MJD */
16290 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
16293 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
16294 ? "Overwriting end of array!\n" : "OK",
16295 (UV)(RExC_emit - RExC_emit_start),
16296 (UV)(RExC_parse - RExC_start),
16297 (UV)RExC_offsets[0]));
16298 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
16305 - reg_node - emit a node
16307 STATIC regnode * /* Location. */
16308 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
16310 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
16312 PERL_ARGS_ASSERT_REG_NODE;
16314 assert(regarglen[op] == 0);
16317 regnode *ptr = ret;
16318 FILL_ADVANCE_NODE(ptr, op);
16319 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
16326 - reganode - emit a node with an argument
16328 STATIC regnode * /* Location. */
16329 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
16331 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
16333 PERL_ARGS_ASSERT_REGANODE;
16335 assert(regarglen[op] == 1);
16338 regnode *ptr = ret;
16339 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
16340 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
16347 S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2)
16349 /* emit a node with U32 and I32 arguments */
16351 regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
16353 PERL_ARGS_ASSERT_REG2LANODE;
16355 assert(regarglen[op] == 2);
16358 regnode *ptr = ret;
16359 FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
16366 - reginsert - insert an operator in front of already-emitted operand
16368 * Means relocating the operand.
16371 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
16376 const int offset = regarglen[(U8)op];
16377 const int size = NODE_STEP_REGNODE + offset;
16378 GET_RE_DEBUG_FLAGS_DECL;
16380 PERL_ARGS_ASSERT_REGINSERT;
16381 PERL_UNUSED_CONTEXT;
16382 PERL_UNUSED_ARG(depth);
16383 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
16384 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
16393 if (RExC_open_parens) {
16395 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
16396 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
16397 if ( RExC_open_parens[paren] >= opnd ) {
16398 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
16399 RExC_open_parens[paren] += size;
16401 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
16403 if ( RExC_close_parens[paren] >= opnd ) {
16404 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
16405 RExC_close_parens[paren] += size;
16407 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
16412 while (src > opnd) {
16413 StructCopy(--src, --dst, regnode);
16414 #ifdef RE_TRACK_PATTERN_OFFSETS
16415 if (RExC_offsets) { /* MJD 20010112 */
16417 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
16421 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
16422 ? "Overwriting end of array!\n" : "OK",
16423 (UV)(src - RExC_emit_start),
16424 (UV)(dst - RExC_emit_start),
16425 (UV)RExC_offsets[0]));
16426 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
16427 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
16433 place = opnd; /* Op node, where operand used to be. */
16434 #ifdef RE_TRACK_PATTERN_OFFSETS
16435 if (RExC_offsets) { /* MJD */
16437 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
16441 (UV)(place - RExC_emit_start) > RExC_offsets[0]
16442 ? "Overwriting end of array!\n" : "OK",
16443 (UV)(place - RExC_emit_start),
16444 (UV)(RExC_parse - RExC_start),
16445 (UV)RExC_offsets[0]));
16446 Set_Node_Offset(place, RExC_parse);
16447 Set_Node_Length(place, 1);
16450 src = NEXTOPER(place);
16451 FILL_ADVANCE_NODE(place, op);
16452 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
16453 Zero(src, offset, regnode);
16457 - regtail - set the next-pointer at the end of a node chain of p to val.
16458 - SEE ALSO: regtail_study
16460 /* TODO: All three parms should be const */
16462 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16463 const regnode *val,U32 depth)
16466 GET_RE_DEBUG_FLAGS_DECL;
16468 PERL_ARGS_ASSERT_REGTAIL;
16470 PERL_UNUSED_ARG(depth);
16476 /* Find last node. */
16479 regnode * const temp = regnext(scan);
16481 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
16482 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16483 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
16484 SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
16485 (temp == NULL ? "->" : ""),
16486 (temp == NULL ? PL_reg_name[OP(val)] : "")
16494 if (reg_off_by_arg[OP(scan)]) {
16495 ARG_SET(scan, val - scan);
16498 NEXT_OFF(scan) = val - scan;
16504 - regtail_study - set the next-pointer at the end of a node chain of p to val.
16505 - Look for optimizable sequences at the same time.
16506 - currently only looks for EXACT chains.
16508 This is experimental code. The idea is to use this routine to perform
16509 in place optimizations on branches and groups as they are constructed,
16510 with the long term intention of removing optimization from study_chunk so
16511 that it is purely analytical.
16513 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
16514 to control which is which.
16517 /* TODO: All four parms should be const */
16520 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
16521 const regnode *val,U32 depth)
16525 #ifdef EXPERIMENTAL_INPLACESCAN
16528 GET_RE_DEBUG_FLAGS_DECL;
16530 PERL_ARGS_ASSERT_REGTAIL_STUDY;
16536 /* Find last node. */
16540 regnode * const temp = regnext(scan);
16541 #ifdef EXPERIMENTAL_INPLACESCAN
16542 if (PL_regkind[OP(scan)] == EXACT) {
16543 bool unfolded_multi_char; /* Unexamined in this routine */
16544 if (join_exact(pRExC_state, scan, &min,
16545 &unfolded_multi_char, 1, val, depth+1))
16550 switch (OP(scan)) {
16554 case EXACTFA_NO_TRIE:
16560 if( exact == PSEUDO )
16562 else if ( exact != OP(scan) )
16571 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
16572 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
16573 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
16574 SvPV_nolen_const(RExC_mysv),
16575 REG_NODE_NUM(scan),
16576 PL_reg_name[exact]);
16583 DEBUG_PARSE_MSG("");
16584 regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
16585 PerlIO_printf(Perl_debug_log,
16586 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
16587 SvPV_nolen_const(RExC_mysv),
16588 (IV)REG_NODE_NUM(val),
16592 if (reg_off_by_arg[OP(scan)]) {
16593 ARG_SET(scan, val - scan);
16596 NEXT_OFF(scan) = val - scan;
16604 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
16609 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
16614 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16616 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
16617 if (flags & (1<<bit)) {
16618 if (!set++ && lead)
16619 PerlIO_printf(Perl_debug_log, "%s",lead);
16620 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
16625 PerlIO_printf(Perl_debug_log, "\n");
16627 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16632 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
16638 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
16640 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
16641 if (flags & (1<<bit)) {
16642 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
16645 if (!set++ && lead)
16646 PerlIO_printf(Perl_debug_log, "%s",lead);
16647 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
16650 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
16651 if (!set++ && lead) {
16652 PerlIO_printf(Perl_debug_log, "%s",lead);
16655 case REGEX_UNICODE_CHARSET:
16656 PerlIO_printf(Perl_debug_log, "UNICODE");
16658 case REGEX_LOCALE_CHARSET:
16659 PerlIO_printf(Perl_debug_log, "LOCALE");
16661 case REGEX_ASCII_RESTRICTED_CHARSET:
16662 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
16664 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
16665 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
16668 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
16674 PerlIO_printf(Perl_debug_log, "\n");
16676 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
16682 Perl_regdump(pTHX_ const regexp *r)
16685 SV * const sv = sv_newmortal();
16686 SV *dsv= sv_newmortal();
16687 RXi_GET_DECL(r,ri);
16688 GET_RE_DEBUG_FLAGS_DECL;
16690 PERL_ARGS_ASSERT_REGDUMP;
16692 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
16694 /* Header fields of interest. */
16695 if (r->anchored_substr) {
16696 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
16697 RE_SV_DUMPLEN(r->anchored_substr), 30);
16698 PerlIO_printf(Perl_debug_log,
16699 "anchored %s%s at %"IVdf" ",
16700 s, RE_SV_TAIL(r->anchored_substr),
16701 (IV)r->anchored_offset);
16702 } else if (r->anchored_utf8) {
16703 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
16704 RE_SV_DUMPLEN(r->anchored_utf8), 30);
16705 PerlIO_printf(Perl_debug_log,
16706 "anchored utf8 %s%s at %"IVdf" ",
16707 s, RE_SV_TAIL(r->anchored_utf8),
16708 (IV)r->anchored_offset);
16710 if (r->float_substr) {
16711 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
16712 RE_SV_DUMPLEN(r->float_substr), 30);
16713 PerlIO_printf(Perl_debug_log,
16714 "floating %s%s at %"IVdf"..%"UVuf" ",
16715 s, RE_SV_TAIL(r->float_substr),
16716 (IV)r->float_min_offset, (UV)r->float_max_offset);
16717 } else if (r->float_utf8) {
16718 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
16719 RE_SV_DUMPLEN(r->float_utf8), 30);
16720 PerlIO_printf(Perl_debug_log,
16721 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
16722 s, RE_SV_TAIL(r->float_utf8),
16723 (IV)r->float_min_offset, (UV)r->float_max_offset);
16725 if (r->check_substr || r->check_utf8)
16726 PerlIO_printf(Perl_debug_log,
16728 (r->check_substr == r->float_substr
16729 && r->check_utf8 == r->float_utf8
16730 ? "(checking floating" : "(checking anchored"));
16731 if (r->intflags & PREGf_NOSCAN)
16732 PerlIO_printf(Perl_debug_log, " noscan");
16733 if (r->extflags & RXf_CHECK_ALL)
16734 PerlIO_printf(Perl_debug_log, " isall");
16735 if (r->check_substr || r->check_utf8)
16736 PerlIO_printf(Perl_debug_log, ") ");
16738 if (ri->regstclass) {
16739 regprop(r, sv, ri->regstclass, NULL, NULL);
16740 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
16742 if (r->intflags & PREGf_ANCH) {
16743 PerlIO_printf(Perl_debug_log, "anchored");
16744 if (r->intflags & PREGf_ANCH_MBOL)
16745 PerlIO_printf(Perl_debug_log, "(MBOL)");
16746 if (r->intflags & PREGf_ANCH_SBOL)
16747 PerlIO_printf(Perl_debug_log, "(SBOL)");
16748 if (r->intflags & PREGf_ANCH_GPOS)
16749 PerlIO_printf(Perl_debug_log, "(GPOS)");
16750 PerlIO_putc(Perl_debug_log, ' ');
16752 if (r->intflags & PREGf_GPOS_SEEN)
16753 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
16754 if (r->intflags & PREGf_SKIP)
16755 PerlIO_printf(Perl_debug_log, "plus ");
16756 if (r->intflags & PREGf_IMPLICIT)
16757 PerlIO_printf(Perl_debug_log, "implicit ");
16758 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
16759 if (r->extflags & RXf_EVAL_SEEN)
16760 PerlIO_printf(Perl_debug_log, "with eval ");
16761 PerlIO_printf(Perl_debug_log, "\n");
16763 regdump_extflags("r->extflags: ",r->extflags);
16764 regdump_intflags("r->intflags: ",r->intflags);
16767 PERL_ARGS_ASSERT_REGDUMP;
16768 PERL_UNUSED_CONTEXT;
16769 PERL_UNUSED_ARG(r);
16770 #endif /* DEBUGGING */
16774 - regprop - printable representation of opcode, with run time support
16778 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
16783 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
16784 static const char * const anyofs[] = {
16785 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
16786 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
16787 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
16788 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
16789 || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15
16790 #error Need to adjust order of anyofs[]
16825 RXi_GET_DECL(prog,progi);
16826 GET_RE_DEBUG_FLAGS_DECL;
16828 PERL_ARGS_ASSERT_REGPROP;
16830 sv_setpvn(sv, "", 0);
16832 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
16833 /* It would be nice to FAIL() here, but this may be called from
16834 regexec.c, and it would be hard to supply pRExC_state. */
16835 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16836 (int)OP(o), (int)REGNODE_MAX);
16837 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
16839 k = PL_regkind[OP(o)];
16842 sv_catpvs(sv, " ");
16843 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
16844 * is a crude hack but it may be the best for now since
16845 * we have no flag "this EXACTish node was UTF-8"
16847 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
16848 PERL_PV_ESCAPE_UNI_DETECT |
16849 PERL_PV_ESCAPE_NONASCII |
16850 PERL_PV_PRETTY_ELLIPSES |
16851 PERL_PV_PRETTY_LTGT |
16852 PERL_PV_PRETTY_NOCLEAR
16854 } else if (k == TRIE) {
16855 /* print the details of the trie in dumpuntil instead, as
16856 * progi->data isn't available here */
16857 const char op = OP(o);
16858 const U32 n = ARG(o);
16859 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
16860 (reg_ac_data *)progi->data->data[n] :
16862 const reg_trie_data * const trie
16863 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
16865 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
16866 DEBUG_TRIE_COMPILE_r(
16867 Perl_sv_catpvf(aTHX_ sv,
16868 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
16869 (UV)trie->startstate,
16870 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
16871 (UV)trie->wordcount,
16874 (UV)TRIE_CHARCOUNT(trie),
16875 (UV)trie->uniquecharcount
16878 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16879 sv_catpvs(sv, "[");
16880 (void) put_charclass_bitmap_innards(sv,
16881 (IS_ANYOF_TRIE(op))
16883 : TRIE_BITMAP(trie),
16885 sv_catpvs(sv, "]");
16888 } else if (k == CURLY) {
16889 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16890 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16891 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16893 else if (k == WHILEM && o->flags) /* Ordinal/of */
16894 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16895 else if (k == REF || k == OPEN || k == CLOSE
16896 || k == GROUPP || OP(o)==ACCEPT)
16898 AV *name_list= NULL;
16899 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
16900 if ( RXp_PAREN_NAMES(prog) ) {
16901 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16902 } else if ( pRExC_state ) {
16903 name_list= RExC_paren_name_list;
16906 if ( k != REF || (OP(o) < NREF)) {
16907 SV **name= av_fetch(name_list, ARG(o), 0 );
16909 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16912 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16913 I32 *nums=(I32*)SvPVX(sv_dat);
16914 SV **name= av_fetch(name_list, nums[0], 0 );
16917 for ( n=0; n<SvIVX(sv_dat); n++ ) {
16918 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16919 (n ? "," : ""), (IV)nums[n]);
16921 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16925 if ( k == REF && reginfo) {
16926 U32 n = ARG(o); /* which paren pair */
16927 I32 ln = prog->offs[n].start;
16928 if (prog->lastparen < n || ln == -1)
16929 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16930 else if (ln == prog->offs[n].end)
16931 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16933 const char *s = reginfo->strbeg + ln;
16934 Perl_sv_catpvf(aTHX_ sv, ": ");
16935 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16936 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16939 } else if (k == GOSUB) {
16940 AV *name_list= NULL;
16941 if ( RXp_PAREN_NAMES(prog) ) {
16942 name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16943 } else if ( pRExC_state ) {
16944 name_list= RExC_paren_name_list;
16947 /* Paren and offset */
16948 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16950 SV **name= av_fetch(name_list, ARG(o), 0 );
16952 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16955 else if (k == VERB) {
16957 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16958 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16959 } else if (k == LOGICAL)
16960 /* 2: embedded, otherwise 1 */
16961 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16962 else if (k == ANYOF) {
16963 const U8 flags = ANYOF_FLAGS(o);
16965 SV* bitmap_invlist; /* Will hold what the bit map contains */
16968 if (OP(o) == ANYOFL)
16969 sv_catpvs(sv, "{loc}");
16970 if (flags & ANYOF_LOC_FOLD)
16971 sv_catpvs(sv, "{i}");
16972 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16973 if (flags & ANYOF_INVERT)
16974 sv_catpvs(sv, "^");
16976 /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16978 do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16981 /* output any special charclass tests (used entirely under use
16983 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16985 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16986 if (ANYOF_POSIXL_TEST(o,i)) {
16987 sv_catpv(sv, anyofs[i]);
16993 if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16994 |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16995 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16999 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
17000 if (flags & ANYOF_INVERT)
17001 /*make sure the invert info is in each */
17002 sv_catpvs(sv, "^");
17005 if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
17006 sv_catpvs(sv, "{non-utf8-latin1-all}");
17009 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
17010 sv_catpvs(sv, "{above_bitmap_all}");
17012 if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
17013 SV *lv; /* Set if there is something outside the bit map. */
17014 bool byte_output = FALSE; /* If something has been output */
17015 SV *only_utf8_locale;
17017 /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
17018 * is used to guarantee that nothing in the bitmap gets
17020 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
17021 &lv, &only_utf8_locale,
17023 if (lv && lv != &PL_sv_undef) {
17024 char *s = savesvpv(lv);
17025 char * const origs = s;
17027 while (*s && *s != '\n')
17031 const char * const t = ++s;
17033 if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
17034 sv_catpvs(sv, "{outside bitmap}");
17037 sv_catpvs(sv, "{utf8}");
17041 sv_catpvs(sv, " ");
17047 /* Truncate very long output */
17048 if (s - origs > 256) {
17049 Perl_sv_catpvf(aTHX_ sv,
17051 (int) (s - origs - 1),
17057 else if (*s == '\t') {
17071 SvREFCNT_dec_NN(lv);
17074 if ((flags & ANYOF_LOC_FOLD)
17075 && only_utf8_locale
17076 && only_utf8_locale != &PL_sv_undef)
17079 int max_entries = 256;
17081 sv_catpvs(sv, "{utf8 locale}");
17082 invlist_iterinit(only_utf8_locale);
17083 while (invlist_iternext(only_utf8_locale,
17085 put_range(sv, start, end, FALSE);
17087 if (max_entries < 0) {
17088 sv_catpvs(sv, "...");
17092 invlist_iterfinish(only_utf8_locale);
17096 SvREFCNT_dec(bitmap_invlist);
17099 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
17101 else if (k == POSIXD || k == NPOSIXD) {
17102 U8 index = FLAGS(o) * 2;
17103 if (index < C_ARRAY_LENGTH(anyofs)) {
17104 if (*anyofs[index] != '[') {
17107 sv_catpv(sv, anyofs[index]);
17108 if (*anyofs[index] != '[') {
17113 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
17116 else if (k == BOUND || k == NBOUND) {
17117 /* Must be synced with order of 'bound_type' in regcomp.h */
17118 const char * const bounds[] = {
17119 "", /* Traditional */
17124 sv_catpv(sv, bounds[FLAGS(o)]);
17126 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
17127 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
17128 else if (OP(o) == SBOL)
17129 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
17131 PERL_UNUSED_CONTEXT;
17132 PERL_UNUSED_ARG(sv);
17133 PERL_UNUSED_ARG(o);
17134 PERL_UNUSED_ARG(prog);
17135 PERL_UNUSED_ARG(reginfo);
17136 PERL_UNUSED_ARG(pRExC_state);
17137 #endif /* DEBUGGING */
17143 Perl_re_intuit_string(pTHX_ REGEXP * const r)
17144 { /* Assume that RE_INTUIT is set */
17145 struct regexp *const prog = ReANY(r);
17146 GET_RE_DEBUG_FLAGS_DECL;
17148 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
17149 PERL_UNUSED_CONTEXT;
17153 const char * const s = SvPV_nolen_const(RX_UTF8(r)
17154 ? prog->check_utf8 : prog->check_substr);
17156 if (!PL_colorset) reginitcolors();
17157 PerlIO_printf(Perl_debug_log,
17158 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
17160 RX_UTF8(r) ? "utf8 " : "",
17161 PL_colors[5],PL_colors[0],
17164 (strlen(s) > 60 ? "..." : ""));
17167 /* use UTF8 check substring if regexp pattern itself is in UTF8 */
17168 return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
17174 handles refcounting and freeing the perl core regexp structure. When
17175 it is necessary to actually free the structure the first thing it
17176 does is call the 'free' method of the regexp_engine associated to
17177 the regexp, allowing the handling of the void *pprivate; member
17178 first. (This routine is not overridable by extensions, which is why
17179 the extensions free is called first.)
17181 See regdupe and regdupe_internal if you change anything here.
17183 #ifndef PERL_IN_XSUB_RE
17185 Perl_pregfree(pTHX_ REGEXP *r)
17191 Perl_pregfree2(pTHX_ REGEXP *rx)
17193 struct regexp *const r = ReANY(rx);
17194 GET_RE_DEBUG_FLAGS_DECL;
17196 PERL_ARGS_ASSERT_PREGFREE2;
17198 if (r->mother_re) {
17199 ReREFCNT_dec(r->mother_re);
17201 CALLREGFREE_PVT(rx); /* free the private data */
17202 SvREFCNT_dec(RXp_PAREN_NAMES(r));
17203 Safefree(r->xpv_len_u.xpvlenu_pv);
17206 SvREFCNT_dec(r->anchored_substr);
17207 SvREFCNT_dec(r->anchored_utf8);
17208 SvREFCNT_dec(r->float_substr);
17209 SvREFCNT_dec(r->float_utf8);
17210 Safefree(r->substrs);
17212 RX_MATCH_COPY_FREE(rx);
17213 #ifdef PERL_ANY_COW
17214 SvREFCNT_dec(r->saved_copy);
17217 SvREFCNT_dec(r->qr_anoncv);
17218 rx->sv_u.svu_rx = 0;
17223 This is a hacky workaround to the structural issue of match results
17224 being stored in the regexp structure which is in turn stored in
17225 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
17226 could be PL_curpm in multiple contexts, and could require multiple
17227 result sets being associated with the pattern simultaneously, such
17228 as when doing a recursive match with (??{$qr})
17230 The solution is to make a lightweight copy of the regexp structure
17231 when a qr// is returned from the code executed by (??{$qr}) this
17232 lightweight copy doesn't actually own any of its data except for
17233 the starp/end and the actual regexp structure itself.
17239 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
17241 struct regexp *ret;
17242 struct regexp *const r = ReANY(rx);
17243 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
17245 PERL_ARGS_ASSERT_REG_TEMP_COPY;
17248 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
17250 SvOK_off((SV *)ret_x);
17252 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
17253 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
17254 made both spots point to the same regexp body.) */
17255 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
17256 assert(!SvPVX(ret_x));
17257 ret_x->sv_u.svu_rx = temp->sv_any;
17258 temp->sv_any = NULL;
17259 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
17260 SvREFCNT_dec_NN(temp);
17261 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
17262 ing below will not set it. */
17263 SvCUR_set(ret_x, SvCUR(rx));
17266 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
17267 sv_force_normal(sv) is called. */
17269 ret = ReANY(ret_x);
17271 SvFLAGS(ret_x) |= SvUTF8(rx);
17272 /* We share the same string buffer as the original regexp, on which we
17273 hold a reference count, incremented when mother_re is set below.
17274 The string pointer is copied here, being part of the regexp struct.
17276 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
17277 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
17279 const I32 npar = r->nparens+1;
17280 Newx(ret->offs, npar, regexp_paren_pair);
17281 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17284 Newx(ret->substrs, 1, struct reg_substr_data);
17285 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17287 SvREFCNT_inc_void(ret->anchored_substr);
17288 SvREFCNT_inc_void(ret->anchored_utf8);
17289 SvREFCNT_inc_void(ret->float_substr);
17290 SvREFCNT_inc_void(ret->float_utf8);
17292 /* check_substr and check_utf8, if non-NULL, point to either their
17293 anchored or float namesakes, and don't hold a second reference. */
17295 RX_MATCH_COPIED_off(ret_x);
17296 #ifdef PERL_ANY_COW
17297 ret->saved_copy = NULL;
17299 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
17300 SvREFCNT_inc_void(ret->qr_anoncv);
17306 /* regfree_internal()
17308 Free the private data in a regexp. This is overloadable by
17309 extensions. Perl takes care of the regexp structure in pregfree(),
17310 this covers the *pprivate pointer which technically perl doesn't
17311 know about, however of course we have to handle the
17312 regexp_internal structure when no extension is in use.
17314 Note this is called before freeing anything in the regexp
17319 Perl_regfree_internal(pTHX_ REGEXP * const rx)
17321 struct regexp *const r = ReANY(rx);
17322 RXi_GET_DECL(r,ri);
17323 GET_RE_DEBUG_FLAGS_DECL;
17325 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
17331 SV *dsv= sv_newmortal();
17332 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
17333 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
17334 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
17335 PL_colors[4],PL_colors[5],s);
17338 #ifdef RE_TRACK_PATTERN_OFFSETS
17340 Safefree(ri->u.offsets); /* 20010421 MJD */
17342 if (ri->code_blocks) {
17344 for (n = 0; n < ri->num_code_blocks; n++)
17345 SvREFCNT_dec(ri->code_blocks[n].src_regex);
17346 Safefree(ri->code_blocks);
17350 int n = ri->data->count;
17353 /* If you add a ->what type here, update the comment in regcomp.h */
17354 switch (ri->data->what[n]) {
17360 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
17363 Safefree(ri->data->data[n]);
17369 { /* Aho Corasick add-on structure for a trie node.
17370 Used in stclass optimization only */
17372 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
17373 #ifdef USE_ITHREADS
17377 refcount = --aho->refcount;
17380 PerlMemShared_free(aho->states);
17381 PerlMemShared_free(aho->fail);
17382 /* do this last!!!! */
17383 PerlMemShared_free(ri->data->data[n]);
17384 /* we should only ever get called once, so
17385 * assert as much, and also guard the free
17386 * which /might/ happen twice. At the least
17387 * it will make code anlyzers happy and it
17388 * doesn't cost much. - Yves */
17389 assert(ri->regstclass);
17390 if (ri->regstclass) {
17391 PerlMemShared_free(ri->regstclass);
17392 ri->regstclass = 0;
17399 /* trie structure. */
17401 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
17402 #ifdef USE_ITHREADS
17406 refcount = --trie->refcount;
17409 PerlMemShared_free(trie->charmap);
17410 PerlMemShared_free(trie->states);
17411 PerlMemShared_free(trie->trans);
17413 PerlMemShared_free(trie->bitmap);
17415 PerlMemShared_free(trie->jump);
17416 PerlMemShared_free(trie->wordinfo);
17417 /* do this last!!!! */
17418 PerlMemShared_free(ri->data->data[n]);
17423 Perl_croak(aTHX_ "panic: regfree data code '%c'",
17424 ri->data->what[n]);
17427 Safefree(ri->data->what);
17428 Safefree(ri->data);
17434 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
17435 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
17436 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
17439 re_dup - duplicate a regexp.
17441 This routine is expected to clone a given regexp structure. It is only
17442 compiled under USE_ITHREADS.
17444 After all of the core data stored in struct regexp is duplicated
17445 the regexp_engine.dupe method is used to copy any private data
17446 stored in the *pprivate pointer. This allows extensions to handle
17447 any duplication it needs to do.
17449 See pregfree() and regfree_internal() if you change anything here.
17451 #if defined(USE_ITHREADS)
17452 #ifndef PERL_IN_XSUB_RE
17454 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
17458 const struct regexp *r = ReANY(sstr);
17459 struct regexp *ret = ReANY(dstr);
17461 PERL_ARGS_ASSERT_RE_DUP_GUTS;
17463 npar = r->nparens+1;
17464 Newx(ret->offs, npar, regexp_paren_pair);
17465 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
17467 if (ret->substrs) {
17468 /* Do it this way to avoid reading from *r after the StructCopy().
17469 That way, if any of the sv_dup_inc()s dislodge *r from the L1
17470 cache, it doesn't matter. */
17471 const bool anchored = r->check_substr
17472 ? r->check_substr == r->anchored_substr
17473 : r->check_utf8 == r->anchored_utf8;
17474 Newx(ret->substrs, 1, struct reg_substr_data);
17475 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
17477 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
17478 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
17479 ret->float_substr = sv_dup_inc(ret->float_substr, param);
17480 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
17482 /* check_substr and check_utf8, if non-NULL, point to either their
17483 anchored or float namesakes, and don't hold a second reference. */
17485 if (ret->check_substr) {
17487 assert(r->check_utf8 == r->anchored_utf8);
17488 ret->check_substr = ret->anchored_substr;
17489 ret->check_utf8 = ret->anchored_utf8;
17491 assert(r->check_substr == r->float_substr);
17492 assert(r->check_utf8 == r->float_utf8);
17493 ret->check_substr = ret->float_substr;
17494 ret->check_utf8 = ret->float_utf8;
17496 } else if (ret->check_utf8) {
17498 ret->check_utf8 = ret->anchored_utf8;
17500 ret->check_utf8 = ret->float_utf8;
17505 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
17506 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
17509 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
17511 if (RX_MATCH_COPIED(dstr))
17512 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
17514 ret->subbeg = NULL;
17515 #ifdef PERL_ANY_COW
17516 ret->saved_copy = NULL;
17519 /* Whether mother_re be set or no, we need to copy the string. We
17520 cannot refrain from copying it when the storage points directly to
17521 our mother regexp, because that's
17522 1: a buffer in a different thread
17523 2: something we no longer hold a reference on
17524 so we need to copy it locally. */
17525 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
17526 ret->mother_re = NULL;
17528 #endif /* PERL_IN_XSUB_RE */
17533 This is the internal complement to regdupe() which is used to copy
17534 the structure pointed to by the *pprivate pointer in the regexp.
17535 This is the core version of the extension overridable cloning hook.
17536 The regexp structure being duplicated will be copied by perl prior
17537 to this and will be provided as the regexp *r argument, however
17538 with the /old/ structures pprivate pointer value. Thus this routine
17539 may override any copying normally done by perl.
17541 It returns a pointer to the new regexp_internal structure.
17545 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
17548 struct regexp *const r = ReANY(rx);
17549 regexp_internal *reti;
17551 RXi_GET_DECL(r,ri);
17553 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
17557 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
17558 char, regexp_internal);
17559 Copy(ri->program, reti->program, len+1, regnode);
17561 reti->num_code_blocks = ri->num_code_blocks;
17562 if (ri->code_blocks) {
17564 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
17565 struct reg_code_block);
17566 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
17567 struct reg_code_block);
17568 for (n = 0; n < ri->num_code_blocks; n++)
17569 reti->code_blocks[n].src_regex = (REGEXP*)
17570 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
17573 reti->code_blocks = NULL;
17575 reti->regstclass = NULL;
17578 struct reg_data *d;
17579 const int count = ri->data->count;
17582 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
17583 char, struct reg_data);
17584 Newx(d->what, count, U8);
17587 for (i = 0; i < count; i++) {
17588 d->what[i] = ri->data->what[i];
17589 switch (d->what[i]) {
17590 /* see also regcomp.h and regfree_internal() */
17591 case 'a': /* actually an AV, but the dup function is identical. */
17595 case 'u': /* actually an HV, but the dup function is identical. */
17596 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
17599 /* This is cheating. */
17600 Newx(d->data[i], 1, regnode_ssc);
17601 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
17602 reti->regstclass = (regnode*)d->data[i];
17605 /* Trie stclasses are readonly and can thus be shared
17606 * without duplication. We free the stclass in pregfree
17607 * when the corresponding reg_ac_data struct is freed.
17609 reti->regstclass= ri->regstclass;
17613 ((reg_trie_data*)ri->data->data[i])->refcount++;
17618 d->data[i] = ri->data->data[i];
17621 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
17622 ri->data->what[i]);
17631 reti->name_list_idx = ri->name_list_idx;
17633 #ifdef RE_TRACK_PATTERN_OFFSETS
17634 if (ri->u.offsets) {
17635 Newx(reti->u.offsets, 2*len+1, U32);
17636 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
17639 SetProgLen(reti,len);
17642 return (void*)reti;
17645 #endif /* USE_ITHREADS */
17647 #ifndef PERL_IN_XSUB_RE
17650 - regnext - dig the "next" pointer out of a node
17653 Perl_regnext(pTHX_ regnode *p)
17660 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
17661 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
17662 (int)OP(p), (int)REGNODE_MAX);
17665 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
17674 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
17677 STRLEN l1 = strlen(pat1);
17678 STRLEN l2 = strlen(pat2);
17681 const char *message;
17683 PERL_ARGS_ASSERT_RE_CROAK2;
17689 Copy(pat1, buf, l1 , char);
17690 Copy(pat2, buf + l1, l2 , char);
17691 buf[l1 + l2] = '\n';
17692 buf[l1 + l2 + 1] = '\0';
17693 va_start(args, pat2);
17694 msv = vmess(buf, &args);
17696 message = SvPV_const(msv,l1);
17699 Copy(message, buf, l1 , char);
17700 /* l1-1 to avoid \n */
17701 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
17704 /* XXX Here's a total kludge. But we need to re-enter for swash routines. */
17706 #ifndef PERL_IN_XSUB_RE
17708 Perl_save_re_context(pTHX)
17713 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
17716 const REGEXP * const rx = PM_GETRE(PL_curpm);
17718 nparens = RX_NPARENS(rx);
17721 /* RT #124109. This is a complete hack; in the SWASHNEW case we know
17722 * that PL_curpm will be null, but that utf8.pm and the modules it
17723 * loads will only use $1..$3.
17724 * The t/porting/re_context.t test file checks this assumption.
17729 for (i = 1; i <= nparens; i++) {
17730 char digits[TYPE_CHARS(long)];
17731 const STRLEN len = my_snprintf(digits, sizeof(digits),
17733 GV *const *const gvp
17734 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
17737 GV * const gv = *gvp;
17738 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
17748 S_put_code_point(pTHX_ SV *sv, UV c)
17750 PERL_ARGS_ASSERT_PUT_CODE_POINT;
17753 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
17755 else if (isPRINT(c)) {
17756 const char string = (char) c;
17757 if (isBACKSLASHED_PUNCT(c))
17758 sv_catpvs(sv, "\\");
17759 sv_catpvn(sv, &string, 1);
17762 const char * const mnemonic = cntrl_to_mnemonic((char) c);
17764 Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
17767 Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
17772 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
17775 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
17777 /* Appends to 'sv' a displayable version of the range of code points from
17778 * 'start' to 'end'. It assumes that only ASCII printables are displayable
17779 * as-is (though some of these will be escaped by put_code_point()). */
17781 const unsigned int min_range_count = 3;
17783 assert(start <= end);
17785 PERL_ARGS_ASSERT_PUT_RANGE;
17787 while (start <= end) {
17789 const char * format;
17791 if (end - start < min_range_count) {
17793 /* Individual chars in short ranges */
17794 for (; start <= end; start++) {
17795 put_code_point(sv, start);
17800 /* If permitted by the input options, and there is a possibility that
17801 * this range contains a printable literal, look to see if there is
17803 if (allow_literals && start <= MAX_PRINT_A) {
17805 /* If the range begin isn't an ASCII printable, effectively split
17806 * the range into two parts:
17807 * 1) the portion before the first such printable,
17809 * and output them separately. */
17810 if (! isPRINT_A(start)) {
17811 UV temp_end = start + 1;
17813 /* There is no point looking beyond the final possible
17814 * printable, in MAX_PRINT_A */
17815 UV max = MIN(end, MAX_PRINT_A);
17817 while (temp_end <= max && ! isPRINT_A(temp_end)) {
17821 /* Here, temp_end points to one beyond the first printable if
17822 * found, or to one beyond 'max' if not. If none found, make
17823 * sure that we use the entire range */
17824 if (temp_end > MAX_PRINT_A) {
17825 temp_end = end + 1;
17828 /* Output the first part of the split range, the part that
17829 * doesn't have printables, with no looking for literals
17830 * (otherwise we would infinitely recurse) */
17831 put_range(sv, start, temp_end - 1, FALSE);
17833 /* The 2nd part of the range (if any) starts here. */
17836 /* We continue instead of dropping down because even if the 2nd
17837 * part is non-empty, it could be so short that we want to
17838 * output it specially, as tested for at the top of this loop.
17843 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
17844 * output a sub-range of just the digits or letters, then process
17845 * the remaining portion as usual. */
17846 if (isALPHANUMERIC_A(start)) {
17847 UV mask = (isDIGIT_A(start))
17852 UV temp_end = start + 1;
17854 /* Find the end of the sub-range that includes just the
17855 * characters in the same class as the first character in it */
17856 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
17861 /* For short ranges, don't duplicate the code above to output
17862 * them; just call recursively */
17863 if (temp_end - start < min_range_count) {
17864 put_range(sv, start, temp_end, FALSE);
17866 else { /* Output as a range */
17867 put_code_point(sv, start);
17868 sv_catpvs(sv, "-");
17869 put_code_point(sv, temp_end);
17871 start = temp_end + 1;
17875 /* We output any other printables as individual characters */
17876 if (isPUNCT_A(start) || isSPACE_A(start)) {
17877 while (start <= end && (isPUNCT_A(start)
17878 || isSPACE_A(start)))
17880 put_code_point(sv, start);
17885 } /* End of looking for literals */
17887 /* Here is not to output as a literal. Some control characters have
17888 * mnemonic names. Split off any of those at the beginning and end of
17889 * the range to print mnemonically. It isn't possible for many of
17890 * these to be in a row, so this won't overwhelm with output */
17891 while (isMNEMONIC_CNTRL(start) && start <= end) {
17892 put_code_point(sv, start);
17895 if (start < end && isMNEMONIC_CNTRL(end)) {
17897 /* Here, the final character in the range has a mnemonic name.
17898 * Work backwards from the end to find the final non-mnemonic */
17899 UV temp_end = end - 1;
17900 while (isMNEMONIC_CNTRL(temp_end)) {
17904 /* And separately output the range that doesn't have mnemonics */
17905 put_range(sv, start, temp_end, FALSE);
17907 /* Then output the mnemonic trailing controls */
17908 start = temp_end + 1;
17909 while (start <= end) {
17910 put_code_point(sv, start);
17916 /* As a final resort, output the range or subrange as hex. */
17918 this_end = (end < NUM_ANYOF_CODE_POINTS)
17920 : NUM_ANYOF_CODE_POINTS - 1;
17921 format = (this_end < 256)
17922 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
17923 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
17924 GCC_DIAG_IGNORE(-Wformat-nonliteral);
17925 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
17932 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
17934 /* Appends to 'sv' a displayable version of the innards of the bracketed
17935 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
17936 * output anything, and bitmap_invlist, if not NULL, will point to an
17937 * inversion list of what is in the bit map */
17941 unsigned int punct_count = 0;
17942 SV* invlist = NULL;
17943 SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
17944 bool allow_literals = TRUE;
17946 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17948 invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17950 /* Worst case is exactly every-other code point is in the list */
17951 *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17953 /* Convert the bit map to an inversion list, keeping track of how many
17954 * ASCII puncts are set, including an extra amount for the backslashed
17956 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17957 if (BITMAP_TEST(bitmap, i)) {
17958 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17959 if (isPUNCT_A(i)) {
17961 if isBACKSLASHED_PUNCT(i) {
17968 /* Nothing to output */
17969 if (_invlist_len(*invlist_ptr) == 0) {
17970 SvREFCNT_dec(invlist);
17974 /* Generally, it is more readable if printable characters are output as
17975 * literals, but if a range (nearly) spans all of them, it's best to output
17976 * it as a single range. This code will use a single range if all but 2
17977 * printables are in it */
17978 invlist_iterinit(*invlist_ptr);
17979 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17981 /* If range starts beyond final printable, it doesn't have any in it */
17982 if (start > MAX_PRINT_A) {
17986 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
17987 * all but two, the range must start and end no later than 2 from
17989 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17990 if (end > MAX_PRINT_A) {
17996 if (end - start >= MAX_PRINT_A - ' ' - 2) {
17997 allow_literals = FALSE;
18002 invlist_iterfinish(*invlist_ptr);
18004 /* The legibility of the output depends mostly on how many punctuation
18005 * characters are output. There are 32 possible ASCII ones, and some have
18006 * an additional backslash, bringing it to currently 36, so if any more
18007 * than 18 are to be output, we can instead output it as its complement,
18008 * yielding fewer puncts, and making it more legible. But give some weight
18009 * to the fact that outputting it as a complement is less legible than a
18010 * straight output, so don't complement unless we are somewhat over the 18
18012 if (allow_literals && punct_count > 22) {
18013 sv_catpvs(sv, "^");
18015 /* Add everything remaining to the list, so when we invert it just
18016 * below, it will be excluded */
18017 _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
18018 _invlist_invert(*invlist_ptr);
18021 /* Here we have figured things out. Output each range */
18022 invlist_iterinit(*invlist_ptr);
18023 while (invlist_iternext(*invlist_ptr, &start, &end)) {
18024 if (start >= NUM_ANYOF_CODE_POINTS) {
18027 put_range(sv, start, end, allow_literals);
18029 invlist_iterfinish(*invlist_ptr);
18034 #define CLEAR_OPTSTART \
18035 if (optstart) STMT_START { \
18036 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
18037 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
18041 #define DUMPUNTIL(b,e) \
18043 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
18045 STATIC const regnode *
18046 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
18047 const regnode *last, const regnode *plast,
18048 SV* sv, I32 indent, U32 depth)
18050 U8 op = PSEUDO; /* Arbitrary non-END op. */
18051 const regnode *next;
18052 const regnode *optstart= NULL;
18054 RXi_GET_DECL(r,ri);
18055 GET_RE_DEBUG_FLAGS_DECL;
18057 PERL_ARGS_ASSERT_DUMPUNTIL;
18059 #ifdef DEBUG_DUMPUNTIL
18060 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
18061 last ? last-start : 0,plast ? plast-start : 0);
18064 if (plast && plast < last)
18067 while (PL_regkind[op] != END && (!last || node < last)) {
18069 /* While that wasn't END last time... */
18072 if (op == CLOSE || op == WHILEM)
18074 next = regnext((regnode *)node);
18077 if (OP(node) == OPTIMIZED) {
18078 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
18085 regprop(r, sv, node, NULL, NULL);
18086 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
18087 (int)(2*indent + 1), "", SvPVX_const(sv));
18089 if (OP(node) != OPTIMIZED) {
18090 if (next == NULL) /* Next ptr. */
18091 PerlIO_printf(Perl_debug_log, " (0)");
18092 else if (PL_regkind[(U8)op] == BRANCH
18093 && PL_regkind[OP(next)] != BRANCH )
18094 PerlIO_printf(Perl_debug_log, " (FAIL)");
18096 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
18097 (void)PerlIO_putc(Perl_debug_log, '\n');
18101 if (PL_regkind[(U8)op] == BRANCHJ) {
18104 const regnode *nnode = (OP(next) == LONGJMP
18105 ? regnext((regnode *)next)
18107 if (last && nnode > last)
18109 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
18112 else if (PL_regkind[(U8)op] == BRANCH) {
18114 DUMPUNTIL(NEXTOPER(node), next);
18116 else if ( PL_regkind[(U8)op] == TRIE ) {
18117 const regnode *this_trie = node;
18118 const char op = OP(node);
18119 const U32 n = ARG(node);
18120 const reg_ac_data * const ac = op>=AHOCORASICK ?
18121 (reg_ac_data *)ri->data->data[n] :
18123 const reg_trie_data * const trie =
18124 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
18126 AV *const trie_words
18127 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
18129 const regnode *nextbranch= NULL;
18132 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
18133 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
18135 PerlIO_printf(Perl_debug_log, "%*s%s ",
18136 (int)(2*(indent+3)), "",
18138 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
18139 SvCUR(*elem_ptr), 60,
18140 PL_colors[0], PL_colors[1],
18142 ? PERL_PV_ESCAPE_UNI
18144 | PERL_PV_PRETTY_ELLIPSES
18145 | PERL_PV_PRETTY_LTGT
18150 U16 dist= trie->jump[word_idx+1];
18151 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
18152 (UV)((dist ? this_trie + dist : next) - start));
18155 nextbranch= this_trie + trie->jump[0];
18156 DUMPUNTIL(this_trie + dist, nextbranch);
18158 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
18159 nextbranch= regnext((regnode *)nextbranch);
18161 PerlIO_printf(Perl_debug_log, "\n");
18164 if (last && next > last)
18169 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
18170 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
18171 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
18173 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
18175 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
18177 else if ( op == PLUS || op == STAR) {
18178 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
18180 else if (PL_regkind[(U8)op] == ANYOF) {
18181 /* arglen 1 + class block */
18182 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
18183 ? ANYOF_POSIXL_SKIP
18185 node = NEXTOPER(node);
18187 else if (PL_regkind[(U8)op] == EXACT) {
18188 /* Literal string, where present. */
18189 node += NODE_SZ_STR(node) - 1;
18190 node = NEXTOPER(node);
18193 node = NEXTOPER(node);
18194 node += regarglen[(U8)op];
18196 if (op == CURLYX || op == OPEN)
18200 #ifdef DEBUG_DUMPUNTIL
18201 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
18206 #endif /* DEBUGGING */
18209 * ex: set ts=8 sts=4 sw=4 et: