5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
10 /* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
12 * a regular expression.
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
20 /* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
24 /* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
29 /* The names of the functions have been changed from regcomp and
30 * regexec to pregcomp and pregexec in order to avoid conflicts
31 * with the POSIX routines of the same names.
34 #ifdef PERL_EXT_RE_BUILD
39 * pregcomp and pregexec -- regsub and regerror are not used in perl
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
59 **** Alterations to Henry's code are...
61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
74 #define PERL_IN_REGCOMP_C
77 #ifndef PERL_IN_XSUB_RE
82 #ifdef PERL_IN_XSUB_RE
84 EXTERN_C const struct regexp_engine my_reg_engine;
89 #include "dquote_static.c"
90 #include "charclass_invlists.h"
91 #include "inline_invlist.c"
92 #include "unicode_constants.h"
94 #define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
96 #define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
98 #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
99 #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
102 #define STATIC static
106 struct RExC_state_t {
107 U32 flags; /* RXf_* are we folding, multilining? */
108 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
109 char *precomp; /* uncompiled string. */
110 REGEXP *rx_sv; /* The SV that is the regexp. */
111 regexp *rx; /* perl core regexp structure */
112 regexp_internal *rxi; /* internal data for regexp object
114 char *start; /* Start of input for compile */
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 SSize_t whilem_seen; /* number of WHILEM in this expr */
118 regnode *emit_start; /* Start of emitted-code area */
119 regnode *emit_bound; /* First regnode outside of the
121 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
122 implies compiling, so don't emit */
123 regnode_ssc emit_dummy; /* placeholder for emit to point to;
124 large enough for the largest
125 non-EXACTish node, so can use it as
127 I32 naughty; /* How bad is this pattern? */
128 I32 sawback; /* Did we see \1, ...? */
130 SSize_t size; /* Code size. */
131 I32 npar; /* Capture buffer count, (OPEN) plus
132 one. ("par" 0 is the whole
134 I32 nestroot; /* root parens we are in - used by
138 regnode **open_parens; /* pointers to open parens */
139 regnode **close_parens; /* pointers to close parens */
140 regnode *opend; /* END node in program */
141 I32 utf8; /* whether the pattern is utf8 or not */
142 I32 orig_utf8; /* whether the pattern was originally in utf8 */
143 /* XXX use this for future optimisation of case
144 * where pattern must be upgraded to utf8. */
145 I32 uni_semantics; /* If a d charset modifier should use unicode
146 rules, even if the pattern is not in
148 HV *paren_names; /* Paren names */
150 regnode **recurse; /* Recurse regops */
151 I32 recurse_count; /* Number of recurse regops */
152 U8 *study_chunk_recursed; /* bitmap of which parens we have moved
154 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
158 I32 override_recoding;
159 I32 in_multi_char_class;
160 struct reg_code_block *code_blocks; /* positions of literal (?{})
162 int num_code_blocks; /* size of code_blocks[] */
163 int code_index; /* next code_blocks[] slot */
164 SSize_t maxlen; /* mininum possible number of chars in string to match */
165 #ifdef ADD_TO_REGEXEC
166 char *starttry; /* -Dr: where regtry was called. */
167 #define RExC_starttry (pRExC_state->starttry)
169 SV *runtime_code_qr; /* qr with the runtime code blocks */
171 const char *lastparse;
173 AV *paren_name_list; /* idx -> name */
174 #define RExC_lastparse (pRExC_state->lastparse)
175 #define RExC_lastnum (pRExC_state->lastnum)
176 #define RExC_paren_name_list (pRExC_state->paren_name_list)
180 #define RExC_flags (pRExC_state->flags)
181 #define RExC_pm_flags (pRExC_state->pm_flags)
182 #define RExC_precomp (pRExC_state->precomp)
183 #define RExC_rx_sv (pRExC_state->rx_sv)
184 #define RExC_rx (pRExC_state->rx)
185 #define RExC_rxi (pRExC_state->rxi)
186 #define RExC_start (pRExC_state->start)
187 #define RExC_end (pRExC_state->end)
188 #define RExC_parse (pRExC_state->parse)
189 #define RExC_whilem_seen (pRExC_state->whilem_seen)
190 #ifdef RE_TRACK_PATTERN_OFFSETS
191 #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
194 #define RExC_emit (pRExC_state->emit)
195 #define RExC_emit_dummy (pRExC_state->emit_dummy)
196 #define RExC_emit_start (pRExC_state->emit_start)
197 #define RExC_emit_bound (pRExC_state->emit_bound)
198 #define RExC_naughty (pRExC_state->naughty)
199 #define RExC_sawback (pRExC_state->sawback)
200 #define RExC_seen (pRExC_state->seen)
201 #define RExC_size (pRExC_state->size)
202 #define RExC_maxlen (pRExC_state->maxlen)
203 #define RExC_npar (pRExC_state->npar)
204 #define RExC_nestroot (pRExC_state->nestroot)
205 #define RExC_extralen (pRExC_state->extralen)
206 #define RExC_seen_zerolen (pRExC_state->seen_zerolen)
207 #define RExC_utf8 (pRExC_state->utf8)
208 #define RExC_uni_semantics (pRExC_state->uni_semantics)
209 #define RExC_orig_utf8 (pRExC_state->orig_utf8)
210 #define RExC_open_parens (pRExC_state->open_parens)
211 #define RExC_close_parens (pRExC_state->close_parens)
212 #define RExC_opend (pRExC_state->opend)
213 #define RExC_paren_names (pRExC_state->paren_names)
214 #define RExC_recurse (pRExC_state->recurse)
215 #define RExC_recurse_count (pRExC_state->recurse_count)
216 #define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
217 #define RExC_study_chunk_recursed_bytes \
218 (pRExC_state->study_chunk_recursed_bytes)
219 #define RExC_in_lookbehind (pRExC_state->in_lookbehind)
220 #define RExC_contains_locale (pRExC_state->contains_locale)
221 #define RExC_contains_i (pRExC_state->contains_i)
222 #define RExC_override_recoding (pRExC_state->override_recoding)
223 #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
226 #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
227 #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
228 ((*s) == '{' && regcurly(s)))
231 * Flags to be passed up and down.
233 #define WORST 0 /* Worst case. */
234 #define HASWIDTH 0x01 /* Known to match non-null strings. */
236 /* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
237 * character. (There needs to be a case: in the switch statement in regexec.c
238 * for any node marked SIMPLE.) Note that this is not the same thing as
241 #define SPSTART 0x04 /* Starts with * or + */
242 #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
243 #define TRYAGAIN 0x10 /* Weeded out a declaration. */
244 #define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
246 #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
248 /* whether trie related optimizations are enabled */
249 #if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
250 #define TRIE_STUDY_OPT
251 #define FULL_TRIE_STUDY
257 #define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
258 #define PBITVAL(paren) (1 << ((paren) & 7))
259 #define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
260 #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
261 #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
263 #define REQUIRE_UTF8 STMT_START { \
265 *flagp = RESTART_UTF8; \
270 /* This converts the named class defined in regcomp.h to its equivalent class
271 * number defined in handy.h. */
272 #define namedclass_to_classnum(class) ((int) ((class) / 2))
273 #define classnum_to_namedclass(classnum) ((classnum) * 2)
275 #define _invlist_union_complement_2nd(a, b, output) \
276 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
277 #define _invlist_intersection_complement_2nd(a, b, output) \
278 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
280 /* About scan_data_t.
282 During optimisation we recurse through the regexp program performing
283 various inplace (keyhole style) optimisations. In addition study_chunk
284 and scan_commit populate this data structure with information about
285 what strings MUST appear in the pattern. We look for the longest
286 string that must appear at a fixed location, and we look for the
287 longest string that may appear at a floating location. So for instance
292 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
293 strings (because they follow a .* construct). study_chunk will identify
294 both FOO and BAR as being the longest fixed and floating strings respectively.
296 The strings can be composites, for instance
300 will result in a composite fixed substring 'foo'.
302 For each string some basic information is maintained:
304 - offset or min_offset
305 This is the position the string must appear at, or not before.
306 It also implicitly (when combined with minlenp) tells us how many
307 characters must match before the string we are searching for.
308 Likewise when combined with minlenp and the length of the string it
309 tells us how many characters must appear after the string we have
313 Only used for floating strings. This is the rightmost point that
314 the string can appear at. If set to SSize_t_MAX it indicates that the
315 string can occur infinitely far to the right.
318 A pointer to the minimum number of characters of the pattern that the
319 string was found inside. This is important as in the case of positive
320 lookahead or positive lookbehind we can have multiple patterns
325 The minimum length of the pattern overall is 3, the minimum length
326 of the lookahead part is 3, but the minimum length of the part that
327 will actually match is 1. So 'FOO's minimum length is 3, but the
328 minimum length for the F is 1. This is important as the minimum length
329 is used to determine offsets in front of and behind the string being
330 looked for. Since strings can be composites this is the length of the
331 pattern at the time it was committed with a scan_commit. Note that
332 the length is calculated by study_chunk, so that the minimum lengths
333 are not known until the full pattern has been compiled, thus the
334 pointer to the value.
338 In the case of lookbehind the string being searched for can be
339 offset past the start point of the final matching string.
340 If this value was just blithely removed from the min_offset it would
341 invalidate some of the calculations for how many chars must match
342 before or after (as they are derived from min_offset and minlen and
343 the length of the string being searched for).
344 When the final pattern is compiled and the data is moved from the
345 scan_data_t structure into the regexp structure the information
346 about lookbehind is factored in, with the information that would
347 have been lost precalculated in the end_shift field for the
350 The fields pos_min and pos_delta are used to store the minimum offset
351 and the delta to the maximum offset at the current point in the pattern.
355 typedef struct scan_data_t {
356 /*I32 len_min; unused */
357 /*I32 len_delta; unused */
361 SSize_t last_end; /* min value, <0 unless valid. */
362 SSize_t last_start_min;
363 SSize_t last_start_max;
364 SV **longest; /* Either &l_fixed, or &l_float. */
365 SV *longest_fixed; /* longest fixed string found in pattern */
366 SSize_t offset_fixed; /* offset where it starts */
367 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
368 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
369 SV *longest_float; /* longest floating string found in pattern */
370 SSize_t offset_float_min; /* earliest point in string it can appear */
371 SSize_t offset_float_max; /* latest point in string it can appear */
372 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
373 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
376 SSize_t *last_closep;
377 regnode_ssc *start_class;
381 * Forward declarations for pregcomp()'s friends.
384 static const scan_data_t zero_scan_data =
385 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
387 #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
388 #define SF_BEFORE_SEOL 0x0001
389 #define SF_BEFORE_MEOL 0x0002
390 #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
391 #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
393 #define SF_FIX_SHIFT_EOL (+2)
394 #define SF_FL_SHIFT_EOL (+4)
396 #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
397 #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
399 #define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
400 #define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
401 #define SF_IS_INF 0x0040
402 #define SF_HAS_PAR 0x0080
403 #define SF_IN_PAR 0x0100
404 #define SF_HAS_EVAL 0x0200
405 #define SCF_DO_SUBSTR 0x0400
406 #define SCF_DO_STCLASS_AND 0x0800
407 #define SCF_DO_STCLASS_OR 0x1000
408 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
409 #define SCF_WHILEM_VISITED_POS 0x2000
411 #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
412 #define SCF_SEEN_ACCEPT 0x8000
413 #define SCF_TRIE_DOING_RESTUDY 0x10000
415 #define UTF cBOOL(RExC_utf8)
417 /* The enums for all these are ordered so things work out correctly */
418 #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
419 #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
420 == REGEX_DEPENDS_CHARSET)
421 #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
422 #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
423 >= REGEX_UNICODE_CHARSET)
424 #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
425 == REGEX_ASCII_RESTRICTED_CHARSET)
426 #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
427 >= REGEX_ASCII_RESTRICTED_CHARSET)
428 #define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
429 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
431 #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
433 /* For programs that want to be strictly Unicode compatible by dying if any
434 * attempt is made to match a non-Unicode code point against a Unicode
436 #define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
438 #define OOB_NAMEDCLASS -1
440 /* There is no code point that is out-of-bounds, so this is problematic. But
441 * its only current use is to initialize a variable that is always set before
443 #define OOB_UNICODE 0xDEADBEEF
445 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
446 #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
449 /* length of regex to show in messages that don't mark a position within */
450 #define RegexLengthToShowInErrorMessages 127
453 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
454 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
455 * op/pragma/warn/regcomp.
457 #define MARKER1 "<-- HERE" /* marker as it appears in the description */
458 #define MARKER2 " <-- HERE " /* marker as it appears within the regex */
460 #define REPORT_LOCATION " in regex; marked by " MARKER1 \
461 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
463 #define REPORT_LOCATION_ARGS(offset) \
464 UTF8fARG(UTF, offset, RExC_precomp), \
465 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
468 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
469 * arg. Show regex, up to a maximum length. If it's too long, chop and add
472 #define _FAIL(code) STMT_START { \
473 const char *ellipses = ""; \
474 IV len = RExC_end - RExC_precomp; \
477 SAVEFREESV(RExC_rx_sv); \
478 if (len > RegexLengthToShowInErrorMessages) { \
479 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
480 len = RegexLengthToShowInErrorMessages - 10; \
486 #define FAIL(msg) _FAIL( \
487 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
488 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
490 #define FAIL2(msg,arg) _FAIL( \
491 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
492 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
495 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
497 #define Simple_vFAIL(m) STMT_START { \
498 const IV offset = RExC_parse - RExC_precomp; \
499 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
500 m, REPORT_LOCATION_ARGS(offset)); \
504 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
506 #define vFAIL(m) STMT_START { \
508 SAVEFREESV(RExC_rx_sv); \
513 * Like Simple_vFAIL(), but accepts two arguments.
515 #define Simple_vFAIL2(m,a1) STMT_START { \
516 const IV offset = RExC_parse - RExC_precomp; \
517 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
518 REPORT_LOCATION_ARGS(offset)); \
522 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
524 #define vFAIL2(m,a1) STMT_START { \
526 SAVEFREESV(RExC_rx_sv); \
527 Simple_vFAIL2(m, a1); \
532 * Like Simple_vFAIL(), but accepts three arguments.
534 #define Simple_vFAIL3(m, a1, a2) STMT_START { \
535 const IV offset = RExC_parse - RExC_precomp; \
536 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
537 REPORT_LOCATION_ARGS(offset)); \
541 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
543 #define vFAIL3(m,a1,a2) STMT_START { \
545 SAVEFREESV(RExC_rx_sv); \
546 Simple_vFAIL3(m, a1, a2); \
550 * Like Simple_vFAIL(), but accepts four arguments.
552 #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
553 const IV offset = RExC_parse - RExC_precomp; \
554 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
555 REPORT_LOCATION_ARGS(offset)); \
558 #define vFAIL4(m,a1,a2,a3) STMT_START { \
560 SAVEFREESV(RExC_rx_sv); \
561 Simple_vFAIL4(m, a1, a2, a3); \
564 /* A specialized version of vFAIL2 that works with UTF8f */
565 #define vFAIL2utf8f(m, a1) STMT_START { \
566 const IV offset = RExC_parse - RExC_precomp; \
568 SAVEFREESV(RExC_rx_sv); \
569 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
570 REPORT_LOCATION_ARGS(offset)); \
573 /* These have asserts in them because of [perl #122671] Many warnings in
574 * regcomp.c can occur twice. If they get output in pass1 and later in that
575 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
576 * would get output again. So they should be output in pass2, and these
577 * asserts make sure new warnings follow that paradigm. */
579 /* m is not necessarily a "literal string", in this macro */
580 #define reg_warn_non_literal_string(loc, m) STMT_START { \
581 const IV offset = loc - RExC_precomp; \
582 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
583 m, REPORT_LOCATION_ARGS(offset)); \
586 #define ckWARNreg(loc,m) STMT_START { \
587 const IV offset = loc - RExC_precomp; \
588 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
589 REPORT_LOCATION_ARGS(offset)); \
592 #define vWARN_dep(loc, m) STMT_START { \
593 const IV offset = loc - RExC_precomp; \
594 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
595 REPORT_LOCATION_ARGS(offset)); \
598 #define ckWARNdep(loc,m) STMT_START { \
599 const IV offset = loc - RExC_precomp; \
600 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
602 REPORT_LOCATION_ARGS(offset)); \
605 #define ckWARNregdep(loc,m) STMT_START { \
606 const IV offset = loc - RExC_precomp; \
607 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
609 REPORT_LOCATION_ARGS(offset)); \
612 #define ckWARN2reg_d(loc,m, a1) STMT_START { \
613 const IV offset = loc - RExC_precomp; \
614 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
616 a1, REPORT_LOCATION_ARGS(offset)); \
619 #define ckWARN2reg(loc, m, a1) STMT_START { \
620 const IV offset = loc - RExC_precomp; \
621 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
622 a1, REPORT_LOCATION_ARGS(offset)); \
625 #define vWARN3(loc, m, a1, a2) STMT_START { \
626 const IV offset = loc - RExC_precomp; \
627 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
628 a1, a2, REPORT_LOCATION_ARGS(offset)); \
631 #define ckWARN3reg(loc, m, a1, a2) STMT_START { \
632 const IV offset = loc - RExC_precomp; \
633 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
634 a1, a2, REPORT_LOCATION_ARGS(offset)); \
637 #define vWARN4(loc, m, a1, a2, a3) STMT_START { \
638 const IV offset = loc - RExC_precomp; \
639 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
640 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
643 #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
644 const IV offset = loc - RExC_precomp; \
645 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
646 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
649 #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
650 const IV offset = loc - RExC_precomp; \
651 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
652 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
656 /* Allow for side effects in s */
657 #define REGC(c,s) STMT_START { \
658 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
661 /* Macros for recording node offsets. 20001227 mjd@plover.com
662 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
663 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
664 * Element 0 holds the number n.
665 * Position is 1 indexed.
667 #ifndef RE_TRACK_PATTERN_OFFSETS
668 #define Set_Node_Offset_To_R(node,byte)
669 #define Set_Node_Offset(node,byte)
670 #define Set_Cur_Node_Offset
671 #define Set_Node_Length_To_R(node,len)
672 #define Set_Node_Length(node,len)
673 #define Set_Node_Cur_Length(node,start)
674 #define Node_Offset(n)
675 #define Node_Length(n)
676 #define Set_Node_Offset_Length(node,offset,len)
677 #define ProgLen(ri) ri->u.proglen
678 #define SetProgLen(ri,x) ri->u.proglen = x
680 #define ProgLen(ri) ri->u.offsets[0]
681 #define SetProgLen(ri,x) ri->u.offsets[0] = x
682 #define Set_Node_Offset_To_R(node,byte) STMT_START { \
684 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
685 __LINE__, (int)(node), (int)(byte))); \
687 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
690 RExC_offsets[2*(node)-1] = (byte); \
695 #define Set_Node_Offset(node,byte) \
696 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
697 #define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
699 #define Set_Node_Length_To_R(node,len) STMT_START { \
701 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
702 __LINE__, (int)(node), (int)(len))); \
704 Perl_croak(aTHX_ "value of node is %d in Length macro", \
707 RExC_offsets[2*(node)] = (len); \
712 #define Set_Node_Length(node,len) \
713 Set_Node_Length_To_R((node)-RExC_emit_start, len)
714 #define Set_Node_Cur_Length(node, start) \
715 Set_Node_Length(node, RExC_parse - start)
717 /* Get offsets and lengths */
718 #define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
719 #define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
721 #define Set_Node_Offset_Length(node,offset,len) STMT_START { \
722 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
723 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
727 #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
728 #define EXPERIMENTAL_INPLACESCAN
729 #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
731 #define DEBUG_RExC_seen() \
732 DEBUG_OPTIMISE_MORE_r({ \
733 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
735 if (RExC_seen & REG_ZERO_LEN_SEEN) \
736 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
738 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
739 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
741 if (RExC_seen & REG_GPOS_SEEN) \
742 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
744 if (RExC_seen & REG_CANY_SEEN) \
745 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
747 if (RExC_seen & REG_RECURSE_SEEN) \
748 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
750 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
751 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
753 if (RExC_seen & REG_VERBARG_SEEN) \
754 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
756 if (RExC_seen & REG_CUTGROUP_SEEN) \
757 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
759 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
760 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
762 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
763 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
765 if (RExC_seen & REG_GOSTART_SEEN) \
766 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
768 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
769 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
771 PerlIO_printf(Perl_debug_log,"\n"); \
774 #define DEBUG_STUDYDATA(str,data,depth) \
775 DEBUG_OPTIMISE_MORE_r(if(data){ \
776 PerlIO_printf(Perl_debug_log, \
777 "%*s" str "Pos:%"IVdf"/%"IVdf \
778 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
779 (int)(depth)*2, "", \
780 (IV)((data)->pos_min), \
781 (IV)((data)->pos_delta), \
782 (UV)((data)->flags), \
783 (IV)((data)->whilem_c), \
784 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
785 is_inf ? "INF " : "" \
787 if ((data)->last_found) \
788 PerlIO_printf(Perl_debug_log, \
789 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
790 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
791 SvPVX_const((data)->last_found), \
792 (IV)((data)->last_end), \
793 (IV)((data)->last_start_min), \
794 (IV)((data)->last_start_max), \
795 ((data)->longest && \
796 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
797 SvPVX_const((data)->longest_fixed), \
798 (IV)((data)->offset_fixed), \
799 ((data)->longest && \
800 (data)->longest==&((data)->longest_float)) ? "*" : "", \
801 SvPVX_const((data)->longest_float), \
802 (IV)((data)->offset_float_min), \
803 (IV)((data)->offset_float_max) \
805 PerlIO_printf(Perl_debug_log,"\n"); \
810 /* is c a control character for which we have a mnemonic? */
811 #define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
814 S_cntrl_to_mnemonic(const U8 c)
816 /* Returns the mnemonic string that represents character 'c', if one
817 * exists; NULL otherwise. The only ones that exist for the purposes of
818 * this routine are a few control characters */
821 case '\a': return "\\a";
822 case '\b': return "\\b";
823 case ESC_NATIVE: return "\\e";
824 case '\f': return "\\f";
825 case '\n': return "\\n";
826 case '\r': return "\\r";
827 case '\t': return "\\t";
835 /* Mark that we cannot extend a found fixed substring at this point.
836 Update the longest found anchored substring and the longest found
837 floating substrings if needed. */
840 S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
841 SSize_t *minlenp, int is_inf)
843 const STRLEN l = CHR_SVLEN(data->last_found);
844 const STRLEN old_l = CHR_SVLEN(*data->longest);
845 GET_RE_DEBUG_FLAGS_DECL;
847 PERL_ARGS_ASSERT_SCAN_COMMIT;
849 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
850 SvSetMagicSV(*data->longest, data->last_found);
851 if (*data->longest == data->longest_fixed) {
852 data->offset_fixed = l ? data->last_start_min : data->pos_min;
853 if (data->flags & SF_BEFORE_EOL)
855 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
857 data->flags &= ~SF_FIX_BEFORE_EOL;
858 data->minlen_fixed=minlenp;
859 data->lookbehind_fixed=0;
861 else { /* *data->longest == data->longest_float */
862 data->offset_float_min = l ? data->last_start_min : data->pos_min;
863 data->offset_float_max = (l
864 ? data->last_start_max
865 : (data->pos_delta == SSize_t_MAX
867 : data->pos_min + data->pos_delta));
869 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
870 data->offset_float_max = SSize_t_MAX;
871 if (data->flags & SF_BEFORE_EOL)
873 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
875 data->flags &= ~SF_FL_BEFORE_EOL;
876 data->minlen_float=minlenp;
877 data->lookbehind_float=0;
880 SvCUR_set(data->last_found, 0);
882 SV * const sv = data->last_found;
883 if (SvUTF8(sv) && SvMAGICAL(sv)) {
884 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
890 data->flags &= ~SF_BEFORE_EOL;
891 DEBUG_STUDYDATA("commit: ",data,0);
894 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
895 * list that describes which code points it matches */
898 S_ssc_anything(pTHX_ regnode_ssc *ssc)
900 /* Set the SSC 'ssc' to match an empty string or any code point */
902 PERL_ARGS_ASSERT_SSC_ANYTHING;
904 assert(is_ANYOF_SYNTHETIC(ssc));
906 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
907 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
908 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
912 S_ssc_is_anything(const regnode_ssc *ssc)
914 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
915 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
916 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
917 * in any way, so there's no point in using it */
922 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
924 assert(is_ANYOF_SYNTHETIC(ssc));
926 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
930 /* See if the list consists solely of the range 0 - Infinity */
931 invlist_iterinit(ssc->invlist);
932 ret = invlist_iternext(ssc->invlist, &start, &end)
936 invlist_iterfinish(ssc->invlist);
942 /* If e.g., both \w and \W are set, matches everything */
943 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
945 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
946 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
956 S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
958 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
959 * string, any code point, or any posix class under locale */
961 PERL_ARGS_ASSERT_SSC_INIT;
963 Zero(ssc, 1, regnode_ssc);
964 set_ANYOF_SYNTHETIC(ssc);
965 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
968 /* If any portion of the regex is to operate under locale rules,
969 * initialization includes it. The reason this isn't done for all regexes
970 * is that the optimizer was written under the assumption that locale was
971 * all-or-nothing. Given the complexity and lack of documentation in the
972 * optimizer, and that there are inadequate test cases for locale, many
973 * parts of it may not work properly, it is safest to avoid locale unless
975 if (RExC_contains_locale) {
976 ANYOF_POSIXL_SETALL(ssc);
979 ANYOF_POSIXL_ZERO(ssc);
984 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
985 const regnode_ssc *ssc)
987 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
988 * to the list of code points matched, and locale posix classes; hence does
989 * not check its flags) */
994 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
996 assert(is_ANYOF_SYNTHETIC(ssc));
998 invlist_iterinit(ssc->invlist);
999 ret = invlist_iternext(ssc->invlist, &start, &end)
1003 invlist_iterfinish(ssc->invlist);
1009 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
1017 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1018 const regnode_charclass* const node)
1020 /* Returns a mortal inversion list defining which code points are matched
1021 * by 'node', which is of type ANYOF. Handles complementing the result if
1022 * appropriate. If some code points aren't knowable at this time, the
1023 * returned list must, and will, contain every code point that is a
1026 SV* invlist = sv_2mortal(_new_invlist(0));
1027 SV* only_utf8_locale_invlist = NULL;
1029 const U32 n = ARG(node);
1030 bool new_node_has_latin1 = FALSE;
1032 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1034 /* Look at the data structure created by S_set_ANYOF_arg() */
1035 if (n != ANYOF_ONLY_HAS_BITMAP) {
1036 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1037 AV * const av = MUTABLE_AV(SvRV(rv));
1038 SV **const ary = AvARRAY(av);
1039 assert(RExC_rxi->data->what[n] == 's');
1041 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1042 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1044 else if (ary[0] && ary[0] != &PL_sv_undef) {
1046 /* Here, no compile-time swash, and there are things that won't be
1047 * known until runtime -- we have to assume it could be anything */
1048 return _add_range_to_invlist(invlist, 0, UV_MAX);
1050 else if (ary[3] && ary[3] != &PL_sv_undef) {
1052 /* Here no compile-time swash, and no run-time only data. Use the
1053 * node's inversion list */
1054 invlist = sv_2mortal(invlist_clone(ary[3]));
1057 /* Get the code points valid only under UTF-8 locales */
1058 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1059 && ary[2] && ary[2] != &PL_sv_undef)
1061 only_utf8_locale_invlist = ary[2];
1065 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1066 * code points, and an inversion list for the others, but if there are code
1067 * points that should match only conditionally on the target string being
1068 * UTF-8, those are placed in the inversion list, and not the bitmap.
1069 * Since there are circumstances under which they could match, they are
1070 * included in the SSC. But if the ANYOF node is to be inverted, we have
1071 * to exclude them here, so that when we invert below, the end result
1072 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1073 * have to do this here before we add the unconditionally matched code
1075 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1076 _invlist_intersection_complement_2nd(invlist,
1081 /* Add in the points from the bit map */
1082 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1083 if (ANYOF_BITMAP_TEST(node, i)) {
1084 invlist = add_cp_to_invlist(invlist, i);
1085 new_node_has_latin1 = TRUE;
1089 /* If this can match all upper Latin1 code points, have to add them
1091 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
1092 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1095 /* Similarly for these */
1096 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
1097 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
1100 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1101 _invlist_invert(invlist);
1103 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1105 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1106 * locale. We can skip this if there are no 0-255 at all. */
1107 _invlist_union(invlist, PL_Latin1, &invlist);
1110 /* Similarly add the UTF-8 locale possible matches. These have to be
1111 * deferred until after the non-UTF-8 locale ones are taken care of just
1112 * above, or it leads to wrong results under ANYOF_INVERT */
1113 if (only_utf8_locale_invlist) {
1114 _invlist_union_maybe_complement_2nd(invlist,
1115 only_utf8_locale_invlist,
1116 ANYOF_FLAGS(node) & ANYOF_INVERT,
1123 /* These two functions currently do the exact same thing */
1124 #define ssc_init_zero ssc_init
1126 #define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1127 #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1129 /* 'AND' a given class with another one. Can create false positives. 'ssc'
1130 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1131 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
1134 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1135 const regnode_charclass *and_with)
1137 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1138 * another SSC or a regular ANYOF class. Can create false positives. */
1143 PERL_ARGS_ASSERT_SSC_AND;
1145 assert(is_ANYOF_SYNTHETIC(ssc));
1147 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1148 * the code point inversion list and just the relevant flags */
1149 if (is_ANYOF_SYNTHETIC(and_with)) {
1150 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
1151 anded_flags = ANYOF_FLAGS(and_with);
1153 /* XXX This is a kludge around what appears to be deficiencies in the
1154 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1155 * there are paths through the optimizer where it doesn't get weeded
1156 * out when it should. And if we don't make some extra provision for
1157 * it like the code just below, it doesn't get added when it should.
1158 * This solution is to add it only when AND'ing, which is here, and
1159 * only when what is being AND'ed is the pristine, original node
1160 * matching anything. Thus it is like adding it to ssc_anything() but
1161 * only when the result is to be AND'ed. Probably the same solution
1162 * could be adopted for the same problem we have with /l matching,
1163 * which is solved differently in S_ssc_init(), and that would lead to
1164 * fewer false positives than that solution has. But if this solution
1165 * creates bugs, the consequences are only that a warning isn't raised
1166 * that should be; while the consequences for having /l bugs is
1167 * incorrect matches */
1168 if (ssc_is_anything((regnode_ssc *)and_with)) {
1169 anded_flags |= ANYOF_WARN_SUPER;
1173 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
1174 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1177 ANYOF_FLAGS(ssc) &= anded_flags;
1179 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1180 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1181 * 'and_with' may be inverted. When not inverted, we have the situation of
1183 * (C1 | P1) & (C2 | P2)
1184 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1185 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1186 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1187 * <= ((C1 & C2) | P1 | P2)
1188 * Alternatively, the last few steps could be:
1189 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1190 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1191 * <= (C1 | C2 | (P1 & P2))
1192 * We favor the second approach if either P1 or P2 is non-empty. This is
1193 * because these components are a barrier to doing optimizations, as what
1194 * they match cannot be known until the moment of matching as they are
1195 * dependent on the current locale, 'AND"ing them likely will reduce or
1197 * But we can do better if we know that C1,P1 are in their initial state (a
1198 * frequent occurrence), each matching everything:
1199 * (<everything>) & (C2 | P2) = C2 | P2
1200 * Similarly, if C2,P2 are in their initial state (again a frequent
1201 * occurrence), the result is a no-op
1202 * (C1 | P1) & (<everything>) = C1 | P1
1205 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1206 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1207 * <= (C1 & ~C2) | (P1 & ~P2)
1210 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1211 && ! is_ANYOF_SYNTHETIC(and_with))
1215 ssc_intersection(ssc,
1217 FALSE /* Has already been inverted */
1220 /* If either P1 or P2 is empty, the intersection will be also; can skip
1222 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
1223 ANYOF_POSIXL_ZERO(ssc);
1225 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1227 /* Note that the Posix class component P from 'and_with' actually
1229 * P = Pa | Pb | ... | Pn
1230 * where each component is one posix class, such as in [\w\s].
1232 * ~P = ~(Pa | Pb | ... | Pn)
1233 * = ~Pa & ~Pb & ... & ~Pn
1234 * <= ~Pa | ~Pb | ... | ~Pn
1235 * The last is something we can easily calculate, but unfortunately
1236 * is likely to have many false positives. We could do better
1237 * in some (but certainly not all) instances if two classes in
1238 * P have known relationships. For example
1239 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1241 * :lower: & :print: = :lower:
1242 * And similarly for classes that must be disjoint. For example,
1243 * since \s and \w can have no elements in common based on rules in
1244 * the POSIX standard,
1245 * \w & ^\S = nothing
1246 * Unfortunately, some vendor locales do not meet the Posix
1247 * standard, in particular almost everything by Microsoft.
1248 * The loop below just changes e.g., \w into \W and vice versa */
1250 regnode_charclass_posixl temp;
1251 int add = 1; /* To calculate the index of the complement */
1253 ANYOF_POSIXL_ZERO(&temp);
1254 for (i = 0; i < ANYOF_MAX; i++) {
1256 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1257 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
1259 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
1260 ANYOF_POSIXL_SET(&temp, i + add);
1262 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1264 ANYOF_POSIXL_AND(&temp, ssc);
1266 } /* else ssc already has no posixes */
1267 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1268 in its initial state */
1269 else if (! is_ANYOF_SYNTHETIC(and_with)
1270 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
1272 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1273 * copy it over 'ssc' */
1274 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1275 if (is_ANYOF_SYNTHETIC(and_with)) {
1276 StructCopy(and_with, ssc, regnode_ssc);
1279 ssc->invlist = anded_cp_list;
1280 ANYOF_POSIXL_ZERO(ssc);
1281 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1282 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
1286 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
1287 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
1289 /* One or the other of P1, P2 is non-empty. */
1290 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1291 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1293 ssc_union(ssc, anded_cp_list, FALSE);
1295 else { /* P1 = P2 = empty */
1296 ssc_intersection(ssc, anded_cp_list, FALSE);
1302 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1303 const regnode_charclass *or_with)
1305 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1306 * another SSC or a regular ANYOF class. Can create false positives if
1307 * 'or_with' is to be inverted. */
1312 PERL_ARGS_ASSERT_SSC_OR;
1314 assert(is_ANYOF_SYNTHETIC(ssc));
1316 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1317 * the code point inversion list and just the relevant flags */
1318 if (is_ANYOF_SYNTHETIC(or_with)) {
1319 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
1320 ored_flags = ANYOF_FLAGS(or_with);
1323 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
1324 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
1327 ANYOF_FLAGS(ssc) |= ored_flags;
1329 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1330 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1331 * 'or_with' may be inverted. When not inverted, we have the simple
1332 * situation of computing:
1333 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1334 * If P1|P2 yields a situation with both a class and its complement are
1335 * set, like having both \w and \W, this matches all code points, and we
1336 * can delete these from the P component of the ssc going forward. XXX We
1337 * might be able to delete all the P components, but I (khw) am not certain
1338 * about this, and it is better to be safe.
1341 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1342 * <= (C1 | P1) | ~C2
1343 * <= (C1 | ~C2) | P1
1344 * (which results in actually simpler code than the non-inverted case)
1347 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1348 && ! is_ANYOF_SYNTHETIC(or_with))
1350 /* We ignore P2, leaving P1 going forward */
1351 } /* else Not inverted */
1352 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
1353 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
1354 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1356 for (i = 0; i < ANYOF_MAX; i += 2) {
1357 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1359 ssc_match_all_cp(ssc);
1360 ANYOF_POSIXL_CLEAR(ssc, i);
1361 ANYOF_POSIXL_CLEAR(ssc, i+1);
1369 FALSE /* Already has been inverted */
1373 PERL_STATIC_INLINE void
1374 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1376 PERL_ARGS_ASSERT_SSC_UNION;
1378 assert(is_ANYOF_SYNTHETIC(ssc));
1380 _invlist_union_maybe_complement_2nd(ssc->invlist,
1386 PERL_STATIC_INLINE void
1387 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1389 const bool invert2nd)
1391 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1393 assert(is_ANYOF_SYNTHETIC(ssc));
1395 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1401 PERL_STATIC_INLINE void
1402 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1404 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1406 assert(is_ANYOF_SYNTHETIC(ssc));
1408 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1411 PERL_STATIC_INLINE void
1412 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1414 /* AND just the single code point 'cp' into the SSC 'ssc' */
1416 SV* cp_list = _new_invlist(2);
1418 PERL_ARGS_ASSERT_SSC_CP_AND;
1420 assert(is_ANYOF_SYNTHETIC(ssc));
1422 cp_list = add_cp_to_invlist(cp_list, cp);
1423 ssc_intersection(ssc, cp_list,
1424 FALSE /* Not inverted */
1426 SvREFCNT_dec_NN(cp_list);
1429 PERL_STATIC_INLINE void
1430 S_ssc_clear_locale(regnode_ssc *ssc)
1432 /* Set the SSC 'ssc' to not match any locale things */
1433 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1435 assert(is_ANYOF_SYNTHETIC(ssc));
1437 ANYOF_POSIXL_ZERO(ssc);
1438 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1442 S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1444 /* The inversion list in the SSC is marked mortal; now we need a more
1445 * permanent copy, which is stored the same way that is done in a regular
1446 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1449 SV* invlist = invlist_clone(ssc->invlist);
1451 PERL_ARGS_ASSERT_SSC_FINALIZE;
1453 assert(is_ANYOF_SYNTHETIC(ssc));
1455 /* The code in this file assumes that all but these flags aren't relevant
1456 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1457 * by the time we reach here */
1458 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
1460 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1462 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1463 NULL, NULL, NULL, FALSE);
1465 /* Make sure is clone-safe */
1466 ssc->invlist = NULL;
1468 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
1469 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
1472 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1475 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1476 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1477 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1478 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1479 ? (TRIE_LIST_CUR( idx ) - 1) \
1485 dump_trie(trie,widecharmap,revcharmap)
1486 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1487 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
1489 These routines dump out a trie in a somewhat readable format.
1490 The _interim_ variants are used for debugging the interim
1491 tables that are used to generate the final compressed
1492 representation which is what dump_trie expects.
1494 Part of the reason for their existence is to provide a form
1495 of documentation as to how the different representations function.
1500 Dumps the final compressed table form of the trie to Perl_debug_log.
1501 Used for debugging make_trie().
1505 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1506 AV *revcharmap, U32 depth)
1509 SV *sv=sv_newmortal();
1510 int colwidth= widecharmap ? 6 : 4;
1512 GET_RE_DEBUG_FLAGS_DECL;
1514 PERL_ARGS_ASSERT_DUMP_TRIE;
1516 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1517 (int)depth * 2 + 2,"",
1518 "Match","Base","Ofs" );
1520 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
1521 SV ** const tmp = av_fetch( revcharmap, state, 0);
1523 PerlIO_printf( Perl_debug_log, "%*s",
1525 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1526 PL_colors[0], PL_colors[1],
1527 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1528 PERL_PV_ESCAPE_FIRSTCHAR
1533 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1534 (int)depth * 2 + 2,"");
1536 for( state = 0 ; state < trie->uniquecharcount ; state++ )
1537 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
1538 PerlIO_printf( Perl_debug_log, "\n");
1540 for( state = 1 ; state < trie->statecount ; state++ ) {
1541 const U32 base = trie->states[ state ].trans.base;
1543 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1544 (int)depth * 2 + 2,"", (UV)state);
1546 if ( trie->states[ state ].wordnum ) {
1547 PerlIO_printf( Perl_debug_log, " W%4X",
1548 trie->states[ state ].wordnum );
1550 PerlIO_printf( Perl_debug_log, "%6s", "" );
1553 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1558 while( ( base + ofs < trie->uniquecharcount ) ||
1559 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1560 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1564 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1566 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1567 if ( ( base + ofs >= trie->uniquecharcount )
1568 && ( base + ofs - trie->uniquecharcount
1570 && trie->trans[ base + ofs
1571 - trie->uniquecharcount ].check == state )
1573 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1575 (UV)trie->trans[ base + ofs
1576 - trie->uniquecharcount ].next );
1578 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
1582 PerlIO_printf( Perl_debug_log, "]");
1585 PerlIO_printf( Perl_debug_log, "\n" );
1587 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1589 for (word=1; word <= trie->wordcount; word++) {
1590 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1591 (int)word, (int)(trie->wordinfo[word].prev),
1592 (int)(trie->wordinfo[word].len));
1594 PerlIO_printf(Perl_debug_log, "\n" );
1597 Dumps a fully constructed but uncompressed trie in list form.
1598 List tries normally only are used for construction when the number of
1599 possible chars (trie->uniquecharcount) is very high.
1600 Used for debugging make_trie().
1603 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
1604 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1608 SV *sv=sv_newmortal();
1609 int colwidth= widecharmap ? 6 : 4;
1610 GET_RE_DEBUG_FLAGS_DECL;
1612 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1614 /* print out the table precompression. */
1615 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1616 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1617 "------:-----+-----------------\n" );
1619 for( state=1 ; state < next_alloc ; state ++ ) {
1622 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
1623 (int)depth * 2 + 2,"", (UV)state );
1624 if ( ! trie->states[ state ].wordnum ) {
1625 PerlIO_printf( Perl_debug_log, "%5s| ","");
1627 PerlIO_printf( Perl_debug_log, "W%4x| ",
1628 trie->states[ state ].wordnum
1631 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
1632 SV ** const tmp = av_fetch( revcharmap,
1633 TRIE_LIST_ITEM(state,charid).forid, 0);
1635 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1637 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1639 PL_colors[0], PL_colors[1],
1640 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1641 | PERL_PV_ESCAPE_FIRSTCHAR
1643 TRIE_LIST_ITEM(state,charid).forid,
1644 (UV)TRIE_LIST_ITEM(state,charid).newstate
1647 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1648 (int)((depth * 2) + 14), "");
1651 PerlIO_printf( Perl_debug_log, "\n");
1656 Dumps a fully constructed but uncompressed trie in table form.
1657 This is the normal DFA style state transition table, with a few
1658 twists to facilitate compression later.
1659 Used for debugging make_trie().
1662 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
1663 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1668 SV *sv=sv_newmortal();
1669 int colwidth= widecharmap ? 6 : 4;
1670 GET_RE_DEBUG_FLAGS_DECL;
1672 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
1675 print out the table precompression so that we can do a visual check
1676 that they are identical.
1679 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1681 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1682 SV ** const tmp = av_fetch( revcharmap, charid, 0);
1684 PerlIO_printf( Perl_debug_log, "%*s",
1686 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
1687 PL_colors[0], PL_colors[1],
1688 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1689 PERL_PV_ESCAPE_FIRSTCHAR
1695 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1697 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
1698 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
1701 PerlIO_printf( Perl_debug_log, "\n" );
1703 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1705 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1706 (int)depth * 2 + 2,"",
1707 (UV)TRIE_NODENUM( state ) );
1709 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
1710 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1712 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1714 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
1716 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1717 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1718 (UV)trie->trans[ state ].check );
1720 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1721 (UV)trie->trans[ state ].check,
1722 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1730 /* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1731 startbranch: the first branch in the whole branch sequence
1732 first : start branch of sequence of branch-exact nodes.
1733 May be the same as startbranch
1734 last : Thing following the last branch.
1735 May be the same as tail.
1736 tail : item following the branch sequence
1737 count : words in the sequence
1738 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
1739 depth : indent depth
1741 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
1743 A trie is an N'ary tree where the branches are determined by digital
1744 decomposition of the key. IE, at the root node you look up the 1st character and
1745 follow that branch repeat until you find the end of the branches. Nodes can be
1746 marked as "accepting" meaning they represent a complete word. Eg:
1750 would convert into the following structure. Numbers represent states, letters
1751 following numbers represent valid transitions on the letter from that state, if
1752 the number is in square brackets it represents an accepting state, otherwise it
1753 will be in parenthesis.
1755 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1759 (1) +-i->(6)-+-s->[7]
1761 +-s->(3)-+-h->(4)-+-e->[5]
1763 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1765 This shows that when matching against the string 'hers' we will begin at state 1
1766 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1767 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1768 is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1769 single traverse. We store a mapping from accepting to state to which word was
1770 matched, and then when we have multiple possibilities we try to complete the
1771 rest of the regex in the order in which they occured in the alternation.
1773 The only prior NFA like behaviour that would be changed by the TRIE support is
1774 the silent ignoring of duplicate alternations which are of the form:
1776 / (DUPE|DUPE) X? (?{ ... }) Y /x
1778 Thus EVAL blocks following a trie may be called a different number of times with
1779 and without the optimisation. With the optimisations dupes will be silently
1780 ignored. This inconsistent behaviour of EVAL type nodes is well established as
1781 the following demonstrates:
1783 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1785 which prints out 'word' three times, but
1787 'words'=~/(word|word|word)(?{ print $1 })S/
1789 which doesnt print it out at all. This is due to other optimisations kicking in.
1791 Example of what happens on a structural level:
1793 The regexp /(ac|ad|ab)+/ will produce the following debug output:
1795 1: CURLYM[1] {1,32767}(18)
1806 This would be optimizable with startbranch=5, first=5, last=16, tail=16
1807 and should turn into:
1809 1: CURLYM[1] {1,32767}(18)
1811 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1819 Cases where tail != last would be like /(?foo|bar)baz/:
1829 which would be optimizable with startbranch=1, first=1, last=7, tail=8
1830 and would end up looking like:
1833 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1840 d = uvchr_to_utf8_flags(d, uv, 0);
1842 is the recommended Unicode-aware way of saying
1847 #define TRIE_STORE_REVCHAR(val) \
1850 SV *zlopp = newSV(7); /* XXX: optimize me */ \
1851 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1852 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
1853 SvCUR_set(zlopp, kapow - flrbbbbb); \
1856 av_push(revcharmap, zlopp); \
1858 char ooooff = (char)val; \
1859 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1863 /* This gets the next character from the input, folding it if not already
1865 #define TRIE_READ_CHAR STMT_START { \
1868 /* if it is UTF then it is either already folded, or does not need \
1870 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
1872 else if (folder == PL_fold_latin1) { \
1873 /* This folder implies Unicode rules, which in the range expressible \
1874 * by not UTF is the lower case, with the two exceptions, one of \
1875 * which should have been taken care of before calling this */ \
1876 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1877 uvc = toLOWER_L1(*uc); \
1878 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1881 /* raw data, will be folded later if needed */ \
1889 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1890 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
1891 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1892 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
1894 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1895 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1896 TRIE_LIST_CUR( state )++; \
1899 #define TRIE_LIST_NEW(state) STMT_START { \
1900 Newxz( trie->states[ state ].trans.list, \
1901 4, reg_trie_trans_le ); \
1902 TRIE_LIST_CUR( state ) = 1; \
1903 TRIE_LIST_LEN( state ) = 4; \
1906 #define TRIE_HANDLE_WORD(state) STMT_START { \
1907 U16 dupe= trie->states[ state ].wordnum; \
1908 regnode * const noper_next = regnext( noper ); \
1911 /* store the word for dumping */ \
1913 if (OP(noper) != NOTHING) \
1914 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
1916 tmp = newSVpvn_utf8( "", 0, UTF ); \
1917 av_push( trie_words, tmp ); \
1921 trie->wordinfo[curword].prev = 0; \
1922 trie->wordinfo[curword].len = wordlen; \
1923 trie->wordinfo[curword].accept = state; \
1925 if ( noper_next < tail ) { \
1927 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1929 trie->jump[curword] = (U16)(noper_next - convert); \
1931 jumper = noper_next; \
1933 nextbranch= regnext(cur); \
1937 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1938 /* chain, so that when the bits of chain are later */\
1939 /* linked together, the dups appear in the chain */\
1940 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1941 trie->wordinfo[dupe].prev = curword; \
1943 /* we haven't inserted this word yet. */ \
1944 trie->states[ state ].wordnum = curword; \
1949 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1950 ( ( base + charid >= ucharcount \
1951 && base + charid < ubound \
1952 && state == trie->trans[ base - ucharcount + charid ].check \
1953 && trie->trans[ base - ucharcount + charid ].next ) \
1954 ? trie->trans[ base - ucharcount + charid ].next \
1955 : ( state==1 ? special : 0 ) \
1959 #define MADE_JUMP_TRIE 2
1960 #define MADE_EXACT_TRIE 4
1963 S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1964 regnode *first, regnode *last, regnode *tail,
1965 U32 word_count, U32 flags, U32 depth)
1967 /* first pass, loop through and scan words */
1968 reg_trie_data *trie;
1969 HV *widecharmap = NULL;
1970 AV *revcharmap = newAV();
1976 regnode *jumper = NULL;
1977 regnode *nextbranch = NULL;
1978 regnode *convert = NULL;
1979 U32 *prev_states; /* temp array mapping each state to previous one */
1980 /* we just use folder as a flag in utf8 */
1981 const U8 * folder = NULL;
1984 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
1985 AV *trie_words = NULL;
1986 /* along with revcharmap, this only used during construction but both are
1987 * useful during debugging so we store them in the struct when debugging.
1990 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
1991 STRLEN trie_charcount=0;
1993 SV *re_trie_maxbuff;
1994 GET_RE_DEBUG_FLAGS_DECL;
1996 PERL_ARGS_ASSERT_MAKE_TRIE;
1998 PERL_UNUSED_ARG(depth);
2005 case EXACTFU: folder = PL_fold_latin1; break;
2006 case EXACTF: folder = PL_fold; break;
2007 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
2010 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
2012 trie->startstate = 1;
2013 trie->wordcount = word_count;
2014 RExC_rxi->data->data[ data_slot ] = (void*)trie;
2015 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
2017 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2018 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2019 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2022 trie_words = newAV();
2025 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2026 assert(re_trie_maxbuff);
2027 if (!SvIOK(re_trie_maxbuff)) {
2028 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2030 DEBUG_TRIE_COMPILE_r({
2031 PerlIO_printf( Perl_debug_log,
2032 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2033 (int)depth * 2 + 2, "",
2034 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2035 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
2038 /* Find the node we are going to overwrite */
2039 if ( first == startbranch && OP( last ) != BRANCH ) {
2040 /* whole branch chain */
2043 /* branch sub-chain */
2044 convert = NEXTOPER( first );
2047 /* -- First loop and Setup --
2049 We first traverse the branches and scan each word to determine if it
2050 contains widechars, and how many unique chars there are, this is
2051 important as we have to build a table with at least as many columns as we
2054 We use an array of integers to represent the character codes 0..255
2055 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2056 the native representation of the character value as the key and IV's for
2059 *TODO* If we keep track of how many times each character is used we can
2060 remap the columns so that the table compression later on is more
2061 efficient in terms of memory by ensuring the most common value is in the
2062 middle and the least common are on the outside. IMO this would be better
2063 than a most to least common mapping as theres a decent chance the most
2064 common letter will share a node with the least common, meaning the node
2065 will not be compressible. With a middle is most common approach the worst
2066 case is when we have the least common nodes twice.
2070 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2071 regnode *noper = NEXTOPER( cur );
2072 const U8 *uc = (U8*)STRING( noper );
2073 const U8 *e = uc + STR_LEN( noper );
2075 U32 wordlen = 0; /* required init */
2076 STRLEN minchars = 0;
2077 STRLEN maxchars = 0;
2078 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2081 if (OP(noper) == NOTHING) {
2082 regnode *noper_next= regnext(noper);
2083 if (noper_next != tail && OP(noper_next) == flags) {
2085 uc= (U8*)STRING(noper);
2086 e= uc + STR_LEN(noper);
2087 trie->minlen= STR_LEN(noper);
2094 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
2095 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2096 regardless of encoding */
2097 if (OP( noper ) == EXACTFU_SS) {
2098 /* false positives are ok, so just set this */
2099 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
2102 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2104 TRIE_CHARCOUNT(trie)++;
2107 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2108 * is in effect. Under /i, this character can match itself, or
2109 * anything that folds to it. If not under /i, it can match just
2110 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2111 * all fold to k, and all are single characters. But some folds
2112 * expand to more than one character, so for example LATIN SMALL
2113 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2114 * the string beginning at 'uc' is 'ffi', it could be matched by
2115 * three characters, or just by the one ligature character. (It
2116 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2117 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2118 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2119 * match.) The trie needs to know the minimum and maximum number
2120 * of characters that could match so that it can use size alone to
2121 * quickly reject many match attempts. The max is simple: it is
2122 * the number of folded characters in this branch (since a fold is
2123 * never shorter than what folds to it. */
2127 /* And the min is equal to the max if not under /i (indicated by
2128 * 'folder' being NULL), or there are no multi-character folds. If
2129 * there is a multi-character fold, the min is incremented just
2130 * once, for the character that folds to the sequence. Each
2131 * character in the sequence needs to be added to the list below of
2132 * characters in the trie, but we count only the first towards the
2133 * min number of characters needed. This is done through the
2134 * variable 'foldlen', which is returned by the macros that look
2135 * for these sequences as the number of bytes the sequence
2136 * occupies. Each time through the loop, we decrement 'foldlen' by
2137 * how many bytes the current char occupies. Only when it reaches
2138 * 0 do we increment 'minchars' or look for another multi-character
2140 if (folder == NULL) {
2143 else if (foldlen > 0) {
2144 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
2149 /* See if *uc is the beginning of a multi-character fold. If
2150 * so, we decrement the length remaining to look at, to account
2151 * for the current character this iteration. (We can use 'uc'
2152 * instead of the fold returned by TRIE_READ_CHAR because for
2153 * non-UTF, the latin1_safe macro is smart enough to account
2154 * for all the unfolded characters, and because for UTF, the
2155 * string will already have been folded earlier in the
2156 * compilation process */
2158 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2159 foldlen -= UTF8SKIP(uc);
2162 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2167 /* The current character (and any potential folds) should be added
2168 * to the possible matching characters for this position in this
2172 U8 folded= folder[ (U8) uvc ];
2173 if ( !trie->charmap[ folded ] ) {
2174 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2175 TRIE_STORE_REVCHAR( folded );
2178 if ( !trie->charmap[ uvc ] ) {
2179 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
2180 TRIE_STORE_REVCHAR( uvc );
2183 /* store the codepoint in the bitmap, and its folded
2185 TRIE_BITMAP_SET(trie, uvc);
2187 /* store the folded codepoint */
2188 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
2191 /* store first byte of utf8 representation of
2192 variant codepoints */
2193 if (! UVCHR_IS_INVARIANT(uvc)) {
2194 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
2197 set_bit = 0; /* We've done our bit :-) */
2201 /* XXX We could come up with the list of code points that fold
2202 * to this using PL_utf8_foldclosures, except not for
2203 * multi-char folds, as there may be multiple combinations
2204 * there that could work, which needs to wait until runtime to
2205 * resolve (The comment about LIGATURE FFI above is such an
2210 widecharmap = newHV();
2212 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
2215 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
2217 if ( !SvTRUE( *svpp ) ) {
2218 sv_setiv( *svpp, ++trie->uniquecharcount );
2219 TRIE_STORE_REVCHAR(uvc);
2222 } /* end loop through characters in this branch of the trie */
2224 /* We take the min and max for this branch and combine to find the min
2225 * and max for all branches processed so far */
2226 if( cur == first ) {
2227 trie->minlen = minchars;
2228 trie->maxlen = maxchars;
2229 } else if (minchars < trie->minlen) {
2230 trie->minlen = minchars;
2231 } else if (maxchars > trie->maxlen) {
2232 trie->maxlen = maxchars;
2234 } /* end first pass */
2235 DEBUG_TRIE_COMPILE_r(
2236 PerlIO_printf( Perl_debug_log,
2237 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2238 (int)depth * 2 + 2,"",
2239 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
2240 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2241 (int)trie->minlen, (int)trie->maxlen )
2245 We now know what we are dealing with in terms of unique chars and
2246 string sizes so we can calculate how much memory a naive
2247 representation using a flat table will take. If it's over a reasonable
2248 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
2249 conservative but potentially much slower representation using an array
2252 At the end we convert both representations into the same compressed
2253 form that will be used in regexec.c for matching with. The latter
2254 is a form that cannot be used to construct with but has memory
2255 properties similar to the list form and access properties similar
2256 to the table form making it both suitable for fast searches and
2257 small enough that its feasable to store for the duration of a program.
2259 See the comment in the code where the compressed table is produced
2260 inplace from the flat tabe representation for an explanation of how
2261 the compression works.
2266 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2269 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2270 > SvIV(re_trie_maxbuff) )
2273 Second Pass -- Array Of Lists Representation
2275 Each state will be represented by a list of charid:state records
2276 (reg_trie_trans_le) the first such element holds the CUR and LEN
2277 points of the allocated array. (See defines above).
2279 We build the initial structure using the lists, and then convert
2280 it into the compressed table form which allows faster lookups
2281 (but cant be modified once converted).
2284 STRLEN transcount = 1;
2286 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2287 "%*sCompiling trie using list compiler\n",
2288 (int)depth * 2 + 2, ""));
2290 trie->states = (reg_trie_state *)
2291 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2292 sizeof(reg_trie_state) );
2296 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2298 regnode *noper = NEXTOPER( cur );
2299 U8 *uc = (U8*)STRING( noper );
2300 const U8 *e = uc + STR_LEN( noper );
2301 U32 state = 1; /* required init */
2302 U16 charid = 0; /* sanity init */
2303 U32 wordlen = 0; /* required init */
2305 if (OP(noper) == NOTHING) {
2306 regnode *noper_next= regnext(noper);
2307 if (noper_next != tail && OP(noper_next) == flags) {
2309 uc= (U8*)STRING(noper);
2310 e= uc + STR_LEN(noper);
2314 if (OP(noper) != NOTHING) {
2315 for ( ; uc < e ; uc += len ) {
2320 charid = trie->charmap[ uvc ];
2322 SV** const svpp = hv_fetch( widecharmap,
2329 charid=(U16)SvIV( *svpp );
2332 /* charid is now 0 if we dont know the char read, or
2333 * nonzero if we do */
2340 if ( !trie->states[ state ].trans.list ) {
2341 TRIE_LIST_NEW( state );
2344 check <= TRIE_LIST_USED( state );
2347 if ( TRIE_LIST_ITEM( state, check ).forid
2350 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2355 newstate = next_alloc++;
2356 prev_states[newstate] = state;
2357 TRIE_LIST_PUSH( state, charid, newstate );
2362 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2366 TRIE_HANDLE_WORD(state);
2368 } /* end second pass */
2370 /* next alloc is the NEXT state to be allocated */
2371 trie->statecount = next_alloc;
2372 trie->states = (reg_trie_state *)
2373 PerlMemShared_realloc( trie->states,
2375 * sizeof(reg_trie_state) );
2377 /* and now dump it out before we compress it */
2378 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2379 revcharmap, next_alloc,
2383 trie->trans = (reg_trie_trans *)
2384 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
2391 for( state=1 ; state < next_alloc ; state ++ ) {
2395 DEBUG_TRIE_COMPILE_MORE_r(
2396 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2400 if (trie->states[state].trans.list) {
2401 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2405 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2406 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2407 if ( forid < minid ) {
2409 } else if ( forid > maxid ) {
2413 if ( transcount < tp + maxid - minid + 1) {
2415 trie->trans = (reg_trie_trans *)
2416 PerlMemShared_realloc( trie->trans,
2418 * sizeof(reg_trie_trans) );
2419 Zero( trie->trans + (transcount / 2),
2423 base = trie->uniquecharcount + tp - minid;
2424 if ( maxid == minid ) {
2426 for ( ; zp < tp ; zp++ ) {
2427 if ( ! trie->trans[ zp ].next ) {
2428 base = trie->uniquecharcount + zp - minid;
2429 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2431 trie->trans[ zp ].check = state;
2437 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2439 trie->trans[ tp ].check = state;
2444 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
2445 const U32 tid = base
2446 - trie->uniquecharcount
2447 + TRIE_LIST_ITEM( state, idx ).forid;
2448 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2450 trie->trans[ tid ].check = state;
2452 tp += ( maxid - minid + 1 );
2454 Safefree(trie->states[ state ].trans.list);
2457 DEBUG_TRIE_COMPILE_MORE_r(
2458 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2461 trie->states[ state ].trans.base=base;
2463 trie->lasttrans = tp + 1;
2467 Second Pass -- Flat Table Representation.
2469 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2470 each. We know that we will need Charcount+1 trans at most to store
2471 the data (one row per char at worst case) So we preallocate both
2472 structures assuming worst case.
2474 We then construct the trie using only the .next slots of the entry
2477 We use the .check field of the first entry of the node temporarily
2478 to make compression both faster and easier by keeping track of how
2479 many non zero fields are in the node.
2481 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2484 There are two terms at use here: state as a TRIE_NODEIDX() which is
2485 a number representing the first entry of the node, and state as a
2486 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2487 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2488 if there are 2 entrys per node. eg:
2496 The table is internally in the right hand, idx form. However as we
2497 also have to deal with the states array which is indexed by nodenum
2498 we have to use TRIE_NODENUM() to convert.
2501 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2502 "%*sCompiling trie using table compiler\n",
2503 (int)depth * 2 + 2, ""));
2505 trie->trans = (reg_trie_trans *)
2506 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2507 * trie->uniquecharcount + 1,
2508 sizeof(reg_trie_trans) );
2509 trie->states = (reg_trie_state *)
2510 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2511 sizeof(reg_trie_state) );
2512 next_alloc = trie->uniquecharcount + 1;
2515 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2517 regnode *noper = NEXTOPER( cur );
2518 const U8 *uc = (U8*)STRING( noper );
2519 const U8 *e = uc + STR_LEN( noper );
2521 U32 state = 1; /* required init */
2523 U16 charid = 0; /* sanity init */
2524 U32 accept_state = 0; /* sanity init */
2526 U32 wordlen = 0; /* required init */
2528 if (OP(noper) == NOTHING) {
2529 regnode *noper_next= regnext(noper);
2530 if (noper_next != tail && OP(noper_next) == flags) {
2532 uc= (U8*)STRING(noper);
2533 e= uc + STR_LEN(noper);
2537 if ( OP(noper) != NOTHING ) {
2538 for ( ; uc < e ; uc += len ) {
2543 charid = trie->charmap[ uvc ];
2545 SV* const * const svpp = hv_fetch( widecharmap,
2549 charid = svpp ? (U16)SvIV(*svpp) : 0;
2553 if ( !trie->trans[ state + charid ].next ) {
2554 trie->trans[ state + charid ].next = next_alloc;
2555 trie->trans[ state ].check++;
2556 prev_states[TRIE_NODENUM(next_alloc)]
2557 = TRIE_NODENUM(state);
2558 next_alloc += trie->uniquecharcount;
2560 state = trie->trans[ state + charid ].next;
2562 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2564 /* charid is now 0 if we dont know the char read, or
2565 * nonzero if we do */
2568 accept_state = TRIE_NODENUM( state );
2569 TRIE_HANDLE_WORD(accept_state);
2571 } /* end second pass */
2573 /* and now dump it out before we compress it */
2574 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2576 next_alloc, depth+1));
2580 * Inplace compress the table.*
2582 For sparse data sets the table constructed by the trie algorithm will
2583 be mostly 0/FAIL transitions or to put it another way mostly empty.
2584 (Note that leaf nodes will not contain any transitions.)
2586 This algorithm compresses the tables by eliminating most such
2587 transitions, at the cost of a modest bit of extra work during lookup:
2589 - Each states[] entry contains a .base field which indicates the
2590 index in the state[] array wheres its transition data is stored.
2592 - If .base is 0 there are no valid transitions from that node.
2594 - If .base is nonzero then charid is added to it to find an entry in
2597 -If trans[states[state].base+charid].check!=state then the
2598 transition is taken to be a 0/Fail transition. Thus if there are fail
2599 transitions at the front of the node then the .base offset will point
2600 somewhere inside the previous nodes data (or maybe even into a node
2601 even earlier), but the .check field determines if the transition is
2605 The following process inplace converts the table to the compressed
2606 table: We first do not compress the root node 1,and mark all its
2607 .check pointers as 1 and set its .base pointer as 1 as well. This
2608 allows us to do a DFA construction from the compressed table later,
2609 and ensures that any .base pointers we calculate later are greater
2612 - We set 'pos' to indicate the first entry of the second node.
2614 - We then iterate over the columns of the node, finding the first and
2615 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2616 and set the .check pointers accordingly, and advance pos
2617 appropriately and repreat for the next node. Note that when we copy
2618 the next pointers we have to convert them from the original
2619 NODEIDX form to NODENUM form as the former is not valid post
2622 - If a node has no transitions used we mark its base as 0 and do not
2623 advance the pos pointer.
2625 - If a node only has one transition we use a second pointer into the
2626 structure to fill in allocated fail transitions from other states.
2627 This pointer is independent of the main pointer and scans forward
2628 looking for null transitions that are allocated to a state. When it
2629 finds one it writes the single transition into the "hole". If the
2630 pointer doesnt find one the single transition is appended as normal.
2632 - Once compressed we can Renew/realloc the structures to release the
2635 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2636 specifically Fig 3.47 and the associated pseudocode.
2640 const U32 laststate = TRIE_NODENUM( next_alloc );
2643 trie->statecount = laststate;
2645 for ( state = 1 ; state < laststate ; state++ ) {
2647 const U32 stateidx = TRIE_NODEIDX( state );
2648 const U32 o_used = trie->trans[ stateidx ].check;
2649 U32 used = trie->trans[ stateidx ].check;
2650 trie->trans[ stateidx ].check = 0;
2653 used && charid < trie->uniquecharcount;
2656 if ( flag || trie->trans[ stateidx + charid ].next ) {
2657 if ( trie->trans[ stateidx + charid ].next ) {
2659 for ( ; zp < pos ; zp++ ) {
2660 if ( ! trie->trans[ zp ].next ) {
2664 trie->states[ state ].trans.base
2666 + trie->uniquecharcount
2668 trie->trans[ zp ].next
2669 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2671 trie->trans[ zp ].check = state;
2672 if ( ++zp > pos ) pos = zp;
2679 trie->states[ state ].trans.base
2680 = pos + trie->uniquecharcount - charid ;
2682 trie->trans[ pos ].next
2683 = SAFE_TRIE_NODENUM(
2684 trie->trans[ stateidx + charid ].next );
2685 trie->trans[ pos ].check = state;
2690 trie->lasttrans = pos + 1;
2691 trie->states = (reg_trie_state *)
2692 PerlMemShared_realloc( trie->states, laststate
2693 * sizeof(reg_trie_state) );
2694 DEBUG_TRIE_COMPILE_MORE_r(
2695 PerlIO_printf( Perl_debug_log,
2696 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2697 (int)depth * 2 + 2,"",
2698 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2702 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2705 } /* end table compress */
2707 DEBUG_TRIE_COMPILE_MORE_r(
2708 PerlIO_printf(Perl_debug_log,
2709 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2710 (int)depth * 2 + 2, "",
2711 (UV)trie->statecount,
2712 (UV)trie->lasttrans)
2714 /* resize the trans array to remove unused space */
2715 trie->trans = (reg_trie_trans *)
2716 PerlMemShared_realloc( trie->trans, trie->lasttrans
2717 * sizeof(reg_trie_trans) );
2719 { /* Modify the program and insert the new TRIE node */
2720 U8 nodetype =(U8)(flags & 0xFF);
2724 regnode *optimize = NULL;
2725 #ifdef RE_TRACK_PATTERN_OFFSETS
2728 U32 mjd_nodelen = 0;
2729 #endif /* RE_TRACK_PATTERN_OFFSETS */
2730 #endif /* DEBUGGING */
2732 This means we convert either the first branch or the first Exact,
2733 depending on whether the thing following (in 'last') is a branch
2734 or not and whther first is the startbranch (ie is it a sub part of
2735 the alternation or is it the whole thing.)
2736 Assuming its a sub part we convert the EXACT otherwise we convert
2737 the whole branch sequence, including the first.
2739 /* Find the node we are going to overwrite */
2740 if ( first != startbranch || OP( last ) == BRANCH ) {
2741 /* branch sub-chain */
2742 NEXT_OFF( first ) = (U16)(last - first);
2743 #ifdef RE_TRACK_PATTERN_OFFSETS
2745 mjd_offset= Node_Offset((convert));
2746 mjd_nodelen= Node_Length((convert));
2749 /* whole branch chain */
2751 #ifdef RE_TRACK_PATTERN_OFFSETS
2754 const regnode *nop = NEXTOPER( convert );
2755 mjd_offset= Node_Offset((nop));
2756 mjd_nodelen= Node_Length((nop));
2760 PerlIO_printf(Perl_debug_log,
2761 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2762 (int)depth * 2 + 2, "",
2763 (UV)mjd_offset, (UV)mjd_nodelen)
2766 /* But first we check to see if there is a common prefix we can
2767 split out as an EXACT and put in front of the TRIE node. */
2768 trie->startstate= 1;
2769 if ( trie->bitmap && !widecharmap && !trie->jump ) {
2771 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
2775 const U32 base = trie->states[ state ].trans.base;
2777 if ( trie->states[state].wordnum )
2780 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
2781 if ( ( base + ofs >= trie->uniquecharcount ) &&
2782 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
2783 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2785 if ( ++count > 1 ) {
2786 SV **tmp = av_fetch( revcharmap, ofs, 0);
2787 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
2788 if ( state == 1 ) break;
2790 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2792 PerlIO_printf(Perl_debug_log,
2793 "%*sNew Start State=%"UVuf" Class: [",
2794 (int)depth * 2 + 2, "",
2797 SV ** const tmp = av_fetch( revcharmap, idx, 0);
2798 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
2800 TRIE_BITMAP_SET(trie,*ch);
2802 TRIE_BITMAP_SET(trie, folder[ *ch ]);
2804 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
2808 TRIE_BITMAP_SET(trie,*ch);
2810 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2811 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2817 SV **tmp = av_fetch( revcharmap, idx, 0);
2819 char *ch = SvPV( *tmp, len );
2821 SV *sv=sv_newmortal();
2822 PerlIO_printf( Perl_debug_log,
2823 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2824 (int)depth * 2 + 2, "",
2826 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2827 PL_colors[0], PL_colors[1],
2828 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2829 PERL_PV_ESCAPE_FIRSTCHAR
2834 OP( convert ) = nodetype;
2835 str=STRING(convert);
2838 STR_LEN(convert) += len;
2844 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
2849 trie->prefixlen = (state-1);
2851 regnode *n = convert+NODE_SZ_STR(convert);
2852 NEXT_OFF(convert) = NODE_SZ_STR(convert);
2853 trie->startstate = state;
2854 trie->minlen -= (state - 1);
2855 trie->maxlen -= (state - 1);
2857 /* At least the UNICOS C compiler choked on this
2858 * being argument to DEBUG_r(), so let's just have
2861 #ifdef PERL_EXT_RE_BUILD
2867 regnode *fix = convert;
2868 U32 word = trie->wordcount;
2870 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2871 while( ++fix < n ) {
2872 Set_Node_Offset_Length(fix, 0, 0);
2875 SV ** const tmp = av_fetch( trie_words, word, 0 );
2877 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2878 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2880 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2888 NEXT_OFF(convert) = (U16)(tail - convert);
2889 DEBUG_r(optimize= n);
2895 if ( trie->maxlen ) {
2896 NEXT_OFF( convert ) = (U16)(tail - convert);
2897 ARG_SET( convert, data_slot );
2898 /* Store the offset to the first unabsorbed branch in
2899 jump[0], which is otherwise unused by the jump logic.
2900 We use this when dumping a trie and during optimisation. */
2902 trie->jump[0] = (U16)(nextbranch - convert);
2904 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2905 * and there is a bitmap
2906 * and the first "jump target" node we found leaves enough room
2907 * then convert the TRIE node into a TRIEC node, with the bitmap
2908 * embedded inline in the opcode - this is hypothetically faster.
2910 if ( !trie->states[trie->startstate].wordnum
2912 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
2914 OP( convert ) = TRIEC;
2915 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
2916 PerlMemShared_free(trie->bitmap);
2919 OP( convert ) = TRIE;
2921 /* store the type in the flags */
2922 convert->flags = nodetype;
2926 + regarglen[ OP( convert ) ];
2928 /* XXX We really should free up the resource in trie now,
2929 as we won't use them - (which resources?) dmq */
2931 /* needed for dumping*/
2932 DEBUG_r(if (optimize) {
2933 regnode *opt = convert;
2935 while ( ++opt < optimize) {
2936 Set_Node_Offset_Length(opt,0,0);
2939 Try to clean up some of the debris left after the
2942 while( optimize < jumper ) {
2943 mjd_nodelen += Node_Length((optimize));
2944 OP( optimize ) = OPTIMIZED;
2945 Set_Node_Offset_Length(optimize,0,0);
2948 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
2950 } /* end node insert */
2951 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, convert);
2953 /* Finish populating the prev field of the wordinfo array. Walk back
2954 * from each accept state until we find another accept state, and if
2955 * so, point the first word's .prev field at the second word. If the
2956 * second already has a .prev field set, stop now. This will be the
2957 * case either if we've already processed that word's accept state,
2958 * or that state had multiple words, and the overspill words were
2959 * already linked up earlier.
2966 for (word=1; word <= trie->wordcount; word++) {
2968 if (trie->wordinfo[word].prev)
2970 state = trie->wordinfo[word].accept;
2972 state = prev_states[state];
2975 prev = trie->states[state].wordnum;
2979 trie->wordinfo[word].prev = prev;
2981 Safefree(prev_states);
2985 /* and now dump out the compressed format */
2986 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2988 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2990 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2991 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2993 SvREFCNT_dec_NN(revcharmap);
2997 : trie->startstate>1
3003 S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
3005 /* The Trie is constructed and compressed now so we can build a fail array if
3008 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3010 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3014 We find the fail state for each state in the trie, this state is the longest
3015 proper suffix of the current state's 'word' that is also a proper prefix of
3016 another word in our trie. State 1 represents the word '' and is thus the
3017 default fail state. This allows the DFA not to have to restart after its
3018 tried and failed a word at a given point, it simply continues as though it
3019 had been matching the other word in the first place.
3021 'abcdgu'=~/abcdefg|cdgu/
3022 When we get to 'd' we are still matching the first word, we would encounter
3023 'g' which would fail, which would bring us to the state representing 'd' in
3024 the second word where we would try 'g' and succeed, proceeding to match
3027 /* add a fail transition */
3028 const U32 trie_offset = ARG(source);
3029 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
3031 const U32 ucharcount = trie->uniquecharcount;
3032 const U32 numstates = trie->statecount;
3033 const U32 ubound = trie->lasttrans + ucharcount;
3037 U32 base = trie->states[ 1 ].trans.base;
3040 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
3042 GET_RE_DEBUG_FLAGS_DECL;
3044 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
3045 PERL_UNUSED_CONTEXT;
3047 PERL_UNUSED_ARG(depth);
3050 if ( OP(source) == TRIE ) {
3051 struct regnode_1 *op = (struct regnode_1 *)
3052 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3053 StructCopy(source,op,struct regnode_1);
3054 stclass = (regnode *)op;
3056 struct regnode_charclass *op = (struct regnode_charclass *)
3057 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3058 StructCopy(source,op,struct regnode_charclass);
3059 stclass = (regnode *)op;
3061 OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
3063 ARG_SET( stclass, data_slot );
3064 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
3065 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3066 aho->trie=trie_offset;
3067 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3068 Copy( trie->states, aho->states, numstates, reg_trie_state );
3069 Newxz( q, numstates, U32);
3070 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
3073 /* initialize fail[0..1] to be 1 so that we always have
3074 a valid final fail state */
3075 fail[ 0 ] = fail[ 1 ] = 1;
3077 for ( charid = 0; charid < ucharcount ; charid++ ) {
3078 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3080 q[ q_write ] = newstate;
3081 /* set to point at the root */
3082 fail[ q[ q_write++ ] ]=1;
3085 while ( q_read < q_write) {
3086 const U32 cur = q[ q_read++ % numstates ];
3087 base = trie->states[ cur ].trans.base;
3089 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3090 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3092 U32 fail_state = cur;
3095 fail_state = fail[ fail_state ];
3096 fail_base = aho->states[ fail_state ].trans.base;
3097 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3099 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3100 fail[ ch_state ] = fail_state;
3101 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3103 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3105 q[ q_write++ % numstates] = ch_state;
3109 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3110 when we fail in state 1, this allows us to use the
3111 charclass scan to find a valid start char. This is based on the principle
3112 that theres a good chance the string being searched contains lots of stuff
3113 that cant be a start char.
3115 fail[ 0 ] = fail[ 1 ] = 0;
3116 DEBUG_TRIE_COMPILE_r({
3117 PerlIO_printf(Perl_debug_log,
3118 "%*sStclass Failtable (%"UVuf" states): 0",
3119 (int)(depth * 2), "", (UV)numstates
3121 for( q_read=1; q_read<numstates; q_read++ ) {
3122 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3124 PerlIO_printf(Perl_debug_log, "\n");
3127 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
3132 #define DEBUG_PEEP(str,scan,depth) \
3133 DEBUG_OPTIMISE_r({if (scan){ \
3134 SV * const mysv=sv_newmortal(); \
3135 regnode *Next = regnext(scan); \
3136 regprop(RExC_rx, mysv, scan, NULL); \
3137 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
3138 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3139 Next ? (REG_NODE_NUM(Next)) : 0 ); \
3143 /* The below joins as many adjacent EXACTish nodes as possible into a single
3144 * one. The regop may be changed if the node(s) contain certain sequences that
3145 * require special handling. The joining is only done if:
3146 * 1) there is room in the current conglomerated node to entirely contain the
3148 * 2) they are the exact same node type
3150 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
3151 * these get optimized out
3153 * If a node is to match under /i (folded), the number of characters it matches
3154 * can be different than its character length if it contains a multi-character
3155 * fold. *min_subtract is set to the total delta number of characters of the
3158 * And *unfolded_multi_char is set to indicate whether or not the node contains
3159 * an unfolded multi-char fold. This happens when whether the fold is valid or
3160 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3161 * SMALL LETTER SHARP S, as only if the target string being matched against
3162 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3163 * folding rules depend on the locale in force at runtime. (Multi-char folds
3164 * whose components are all above the Latin1 range are not run-time locale
3165 * dependent, and have already been folded by the time this function is
3168 * This is as good a place as any to discuss the design of handling these
3169 * multi-character fold sequences. It's been wrong in Perl for a very long
3170 * time. There are three code points in Unicode whose multi-character folds
3171 * were long ago discovered to mess things up. The previous designs for
3172 * dealing with these involved assigning a special node for them. This
3173 * approach doesn't always work, as evidenced by this example:
3174 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
3175 * Both sides fold to "sss", but if the pattern is parsed to create a node that
3176 * would match just the \xDF, it won't be able to handle the case where a
3177 * successful match would have to cross the node's boundary. The new approach
3178 * that hopefully generally solves the problem generates an EXACTFU_SS node
3179 * that is "sss" in this case.
3181 * It turns out that there are problems with all multi-character folds, and not
3182 * just these three. Now the code is general, for all such cases. The
3183 * approach taken is:
3184 * 1) This routine examines each EXACTFish node that could contain multi-
3185 * character folded sequences. Since a single character can fold into
3186 * such a sequence, the minimum match length for this node is less than
3187 * the number of characters in the node. This routine returns in
3188 * *min_subtract how many characters to subtract from the the actual
3189 * length of the string to get a real minimum match length; it is 0 if
3190 * there are no multi-char foldeds. This delta is used by the caller to
3191 * adjust the min length of the match, and the delta between min and max,
3192 * so that the optimizer doesn't reject these possibilities based on size
3194 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
3195 * is used for an EXACTFU node that contains at least one "ss" sequence in
3196 * it. For non-UTF-8 patterns and strings, this is the only case where
3197 * there is a possible fold length change. That means that a regular
3198 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3199 * with length changes, and so can be processed faster. regexec.c takes
3200 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3201 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3202 * known until runtime). This saves effort in regex matching. However,
3203 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3204 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3205 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3206 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
3207 * possibilities for the non-UTF8 patterns are quite simple, except for
3208 * the sharp s. All the ones that don't involve a UTF-8 target string are
3209 * members of a fold-pair, and arrays are set up for all of them so that
3210 * the other member of the pair can be found quickly. Code elsewhere in
3211 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3212 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3213 * described in the next item.
3214 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3215 * validity of the fold won't be known until runtime, and so must remain
3216 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3217 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3218 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3219 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3220 * The reason this is a problem is that the optimizer part of regexec.c
3221 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3222 * that a character in the pattern corresponds to at most a single
3223 * character in the target string. (And I do mean character, and not byte
3224 * here, unlike other parts of the documentation that have never been
3225 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3226 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3227 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3228 * nodes, violate the assumption, and they are the only instances where it
3229 * is violated. I'm reluctant to try to change the assumption, as the
3230 * code involved is impenetrable to me (khw), so instead the code here
3231 * punts. This routine examines EXACTFL nodes, and (when the pattern
3232 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3233 * boolean indicating whether or not the node contains such a fold. When
3234 * it is true, the caller sets a flag that later causes the optimizer in
3235 * this file to not set values for the floating and fixed string lengths,
3236 * and thus avoids the optimizer code in regexec.c that makes the invalid
3237 * assumption. Thus, there is no optimization based on string lengths for
3238 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3239 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3240 * assumption is wrong only in these cases is that all other non-UTF-8
3241 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3242 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3243 * EXACTF nodes because we don't know at compile time if it actually
3244 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3245 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3246 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3247 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3248 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3249 * string would require the pattern to be forced into UTF-8, the overhead
3250 * of which we want to avoid. Similarly the unfolded multi-char folds in
3251 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3254 * Similarly, the code that generates tries doesn't currently handle
3255 * not-already-folded multi-char folds, and it looks like a pain to change
3256 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3257 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3258 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3259 * using /iaa matching will be doing so almost entirely with ASCII
3260 * strings, so this should rarely be encountered in practice */
3262 #define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
3263 if (PL_regkind[OP(scan)] == EXACT) \
3264 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
3267 S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3268 UV *min_subtract, bool *unfolded_multi_char,
3269 U32 flags,regnode *val, U32 depth)
3271 /* Merge several consecutive EXACTish nodes into one. */
3272 regnode *n = regnext(scan);
3274 regnode *next = scan + NODE_SZ_STR(scan);
3278 regnode *stop = scan;
3279 GET_RE_DEBUG_FLAGS_DECL;
3281 PERL_UNUSED_ARG(depth);
3284 PERL_ARGS_ASSERT_JOIN_EXACT;
3285 #ifndef EXPERIMENTAL_INPLACESCAN
3286 PERL_UNUSED_ARG(flags);
3287 PERL_UNUSED_ARG(val);
3289 DEBUG_PEEP("join",scan,depth);
3291 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3292 * EXACT ones that are mergeable to the current one. */
3294 && (PL_regkind[OP(n)] == NOTHING
3295 || (stringok && OP(n) == OP(scan)))
3297 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3300 if (OP(n) == TAIL || n > next)
3302 if (PL_regkind[OP(n)] == NOTHING) {
3303 DEBUG_PEEP("skip:",n,depth);
3304 NEXT_OFF(scan) += NEXT_OFF(n);
3305 next = n + NODE_STEP_REGNODE;
3312 else if (stringok) {
3313 const unsigned int oldl = STR_LEN(scan);
3314 regnode * const nnext = regnext(n);
3316 /* XXX I (khw) kind of doubt that this works on platforms (should
3317 * Perl ever run on one) where U8_MAX is above 255 because of lots
3318 * of other assumptions */
3319 /* Don't join if the sum can't fit into a single node */
3320 if (oldl + STR_LEN(n) > U8_MAX)
3323 DEBUG_PEEP("merg",n,depth);
3326 NEXT_OFF(scan) += NEXT_OFF(n);
3327 STR_LEN(scan) += STR_LEN(n);
3328 next = n + NODE_SZ_STR(n);
3329 /* Now we can overwrite *n : */
3330 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3338 #ifdef EXPERIMENTAL_INPLACESCAN
3339 if (flags && !NEXT_OFF(n)) {
3340 DEBUG_PEEP("atch", val, depth);
3341 if (reg_off_by_arg[OP(n)]) {
3342 ARG_SET(n, val - n);
3345 NEXT_OFF(n) = val - n;
3353 *unfolded_multi_char = FALSE;
3355 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3356 * can now analyze for sequences of problematic code points. (Prior to
3357 * this final joining, sequences could have been split over boundaries, and
3358 * hence missed). The sequences only happen in folding, hence for any
3359 * non-EXACT EXACTish node */
3360 if (OP(scan) != EXACT) {
3361 U8* s0 = (U8*) STRING(scan);
3363 U8* s_end = s0 + STR_LEN(scan);
3365 int total_count_delta = 0; /* Total delta number of characters that
3366 multi-char folds expand to */
3368 /* One pass is made over the node's string looking for all the
3369 * possibilities. To avoid some tests in the loop, there are two main
3370 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3375 if (OP(scan) == EXACTFL) {
3378 /* An EXACTFL node would already have been changed to another
3379 * node type unless there is at least one character in it that
3380 * is problematic; likely a character whose fold definition
3381 * won't be known until runtime, and so has yet to be folded.
3382 * For all but the UTF-8 locale, folds are 1-1 in length, but
3383 * to handle the UTF-8 case, we need to create a temporary
3384 * folded copy using UTF-8 locale rules in order to analyze it.
3385 * This is because our macros that look to see if a sequence is
3386 * a multi-char fold assume everything is folded (otherwise the
3387 * tests in those macros would be too complicated and slow).
3388 * Note that here, the non-problematic folds will have already
3389 * been done, so we can just copy such characters. We actually
3390 * don't completely fold the EXACTFL string. We skip the
3391 * unfolded multi-char folds, as that would just create work
3392 * below to figure out the size they already are */
3394 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3397 STRLEN s_len = UTF8SKIP(s);
3398 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3399 Copy(s, d, s_len, U8);
3402 else if (is_FOLDS_TO_MULTI_utf8(s)) {
3403 *unfolded_multi_char = TRUE;
3404 Copy(s, d, s_len, U8);
3407 else if (isASCII(*s)) {
3408 *(d++) = toFOLD(*s);
3412 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3418 /* Point the remainder of the routine to look at our temporary
3422 } /* End of creating folded copy of EXACTFL string */
3424 /* Examine the string for a multi-character fold sequence. UTF-8
3425 * patterns have all characters pre-folded by the time this code is
3427 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3428 length sequence we are looking for is 2 */
3430 int count = 0; /* How many characters in a multi-char fold */
3431 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3432 if (! len) { /* Not a multi-char fold: get next char */
3437 /* Nodes with 'ss' require special handling, except for
3438 * EXACTFA-ish for which there is no multi-char fold to this */
3439 if (len == 2 && *s == 's' && *(s+1) == 's'
3440 && OP(scan) != EXACTFA
3441 && OP(scan) != EXACTFA_NO_TRIE)
3444 if (OP(scan) != EXACTFL) {
3445 OP(scan) = EXACTFU_SS;
3449 else { /* Here is a generic multi-char fold. */
3450 U8* multi_end = s + len;
3452 /* Count how many characters are in it. In the case of
3453 * /aa, no folds which contain ASCII code points are
3454 * allowed, so check for those, and skip if found. */
3455 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
3456 count = utf8_length(s, multi_end);
3460 while (s < multi_end) {
3463 goto next_iteration;
3473 /* The delta is how long the sequence is minus 1 (1 is how long
3474 * the character that folds to the sequence is) */
3475 total_count_delta += count - 1;
3479 /* We created a temporary folded copy of the string in EXACTFL
3480 * nodes. Therefore we need to be sure it doesn't go below zero,
3481 * as the real string could be shorter */
3482 if (OP(scan) == EXACTFL) {
3483 int total_chars = utf8_length((U8*) STRING(scan),
3484 (U8*) STRING(scan) + STR_LEN(scan));
3485 if (total_count_delta > total_chars) {
3486 total_count_delta = total_chars;
3490 *min_subtract += total_count_delta;
3493 else if (OP(scan) == EXACTFA) {
3495 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3496 * fold to the ASCII range (and there are no existing ones in the
3497 * upper latin1 range). But, as outlined in the comments preceding
3498 * this function, we need to flag any occurrences of the sharp s.
3499 * This character forbids trie formation (because of added
3502 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
3503 OP(scan) = EXACTFA_NO_TRIE;
3504 *unfolded_multi_char = TRUE;
3513 /* Non-UTF-8 pattern, not EXACTFA node. Look for the multi-char
3514 * folds that are all Latin1. As explained in the comments
3515 * preceding this function, we look also for the sharp s in EXACTF
3516 * and EXACTFL nodes; it can be in the final position. Otherwise
3517 * we can stop looking 1 byte earlier because have to find at least
3518 * two characters for a multi-fold */
3519 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
3524 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
3525 if (! len) { /* Not a multi-char fold. */
3526 if (*s == LATIN_SMALL_LETTER_SHARP_S
3527 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
3529 *unfolded_multi_char = TRUE;
3536 && isALPHA_FOLD_EQ(*s, 's')
3537 && isALPHA_FOLD_EQ(*(s+1), 's'))
3540 /* EXACTF nodes need to know that the minimum length
3541 * changed so that a sharp s in the string can match this
3542 * ss in the pattern, but they remain EXACTF nodes, as they
3543 * won't match this unless the target string is is UTF-8,
3544 * which we don't know until runtime. EXACTFL nodes can't
3545 * transform into EXACTFU nodes */
3546 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
3547 OP(scan) = EXACTFU_SS;
3551 *min_subtract += len - 1;
3558 /* Allow dumping but overwriting the collection of skipped
3559 * ops and/or strings with fake optimized ops */
3560 n = scan + NODE_SZ_STR(scan);
3568 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3572 /* REx optimizer. Converts nodes into quicker variants "in place".
3573 Finds fixed substrings. */
3575 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
3576 to the position after last scanned or to NULL. */
3578 #define INIT_AND_WITHP \
3579 assert(!and_withp); \
3580 Newx(and_withp,1, regnode_ssc); \
3581 SAVEFREEPV(and_withp)
3583 /* this is a chain of data about sub patterns we are processing that
3584 need to be handled separately/specially in study_chunk. Its so
3585 we can simulate recursion without losing state. */
3587 typedef struct scan_frame {
3588 regnode *last; /* last node to process in this frame */
3589 regnode *next; /* next node to process when last is reached */
3590 struct scan_frame *prev; /*previous frame*/
3591 U32 prev_recursed_depth;
3592 I32 stop; /* what stopparen do we use */
3597 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
3598 SSize_t *minlenp, SSize_t *deltap,
3603 regnode_ssc *and_withp,
3604 U32 flags, U32 depth)
3605 /* scanp: Start here (read-write). */
3606 /* deltap: Write maxlen-minlen here. */
3607 /* last: Stop before this one. */
3608 /* data: string data about the pattern */
3609 /* stopparen: treat close N as END */
3610 /* recursed: which subroutines have we recursed into */
3611 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
3613 /* There must be at least this number of characters to match */
3616 regnode *scan = *scanp, *next;
3618 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
3619 int is_inf_internal = 0; /* The studied chunk is infinite */
3620 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3621 scan_data_t data_fake;
3622 SV *re_trie_maxbuff = NULL;
3623 regnode *first_non_open = scan;
3624 SSize_t stopmin = SSize_t_MAX;
3625 scan_frame *frame = NULL;
3626 GET_RE_DEBUG_FLAGS_DECL;
3628 PERL_ARGS_ASSERT_STUDY_CHUNK;
3631 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3634 while (first_non_open && OP(first_non_open) == OPEN)
3635 first_non_open=regnext(first_non_open);
3640 while ( scan && OP(scan) != END && scan < last ){
3641 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3642 node length to get a real minimum (because
3643 the folded version may be shorter) */
3644 bool unfolded_multi_char = FALSE;
3645 /* Peephole optimizer: */
3646 DEBUG_OPTIMISE_MORE_r(
3648 PerlIO_printf(Perl_debug_log,
3649 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3650 ((int) depth*2), "", (long)stopparen,
3651 (unsigned long)depth, (unsigned long)recursed_depth);
3652 if (recursed_depth) {
3655 for ( j = 0 ; j < recursed_depth ; j++ ) {
3656 PerlIO_printf(Perl_debug_log,"[");
3657 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
3658 PerlIO_printf(Perl_debug_log,"%d",
3659 PAREN_TEST(RExC_study_chunk_recursed +
3660 (j * RExC_study_chunk_recursed_bytes), i)
3663 PerlIO_printf(Perl_debug_log,"]");
3666 PerlIO_printf(Perl_debug_log,"\n");
3669 DEBUG_STUDYDATA("Peep:", data, depth);
3670 DEBUG_PEEP("Peep", scan, depth);
3673 /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
3674 * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
3675 * by a different invocation of reg() -- Yves
3677 JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
3679 /* Follow the next-chain of the current node and optimize
3680 away all the NOTHINGs from it. */
3681 if (OP(scan) != CURLYX) {
3682 const int max = (reg_off_by_arg[OP(scan)]
3684 /* I32 may be smaller than U16 on CRAYs! */
3685 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3686 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3690 /* Skip NOTHING and LONGJMP. */
3691 while ((n = regnext(n))
3692 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3693 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3694 && off + noff < max)
3696 if (reg_off_by_arg[OP(scan)])
3699 NEXT_OFF(scan) = off;
3704 /* The principal pseudo-switch. Cannot be a switch, since we
3705 look into several different things. */
3706 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3707 || OP(scan) == IFTHEN) {
3708 next = regnext(scan);
3710 /* demq: the op(next)==code check is to see if we have
3711 * "branch-branch" AFAICT */
3713 if (OP(next) == code || code == IFTHEN) {
3714 /* NOTE - There is similar code to this block below for
3715 * handling TRIE nodes on a re-study. If you change stuff here
3716 * check there too. */
3717 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
3719 regnode * const startbranch=scan;
3721 if (flags & SCF_DO_SUBSTR) {
3722 /* Cannot merge strings after this. */
3723 scan_commit(pRExC_state, data, minlenp, is_inf);
3726 if (flags & SCF_DO_STCLASS)
3727 ssc_init_zero(pRExC_state, &accum);
3729 while (OP(scan) == code) {
3730 SSize_t deltanext, minnext, fake;
3732 regnode_ssc this_class;
3735 data_fake.flags = 0;
3737 data_fake.whilem_c = data->whilem_c;
3738 data_fake.last_closep = data->last_closep;
3741 data_fake.last_closep = &fake;
3743 data_fake.pos_delta = delta;
3744 next = regnext(scan);
3745 scan = NEXTOPER(scan);
3747 scan = NEXTOPER(scan);
3748 if (flags & SCF_DO_STCLASS) {
3749 ssc_init(pRExC_state, &this_class);
3750 data_fake.start_class = &this_class;
3751 f = SCF_DO_STCLASS_AND;
3753 if (flags & SCF_WHILEM_VISITED_POS)
3754 f |= SCF_WHILEM_VISITED_POS;
3756 /* we suppose the run is continuous, last=next...*/
3757 minnext = study_chunk(pRExC_state, &scan, minlenp,
3758 &deltanext, next, &data_fake, stopparen,
3759 recursed_depth, NULL, f,depth+1);
3762 if (deltanext == SSize_t_MAX) {
3763 is_inf = is_inf_internal = 1;
3765 } else if (max1 < minnext + deltanext)
3766 max1 = minnext + deltanext;
3768 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3770 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3771 if ( stopmin > minnext)
3772 stopmin = min + min1;
3773 flags &= ~SCF_DO_SUBSTR;
3775 data->flags |= SCF_SEEN_ACCEPT;
3778 if (data_fake.flags & SF_HAS_EVAL)
3779 data->flags |= SF_HAS_EVAL;
3780 data->whilem_c = data_fake.whilem_c;
3782 if (flags & SCF_DO_STCLASS)
3783 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
3785 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3787 if (flags & SCF_DO_SUBSTR) {
3788 data->pos_min += min1;
3789 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3790 data->pos_delta = SSize_t_MAX;
3792 data->pos_delta += max1 - min1;
3793 if (max1 != min1 || is_inf)
3794 data->longest = &(data->longest_float);
3797 if (delta == SSize_t_MAX
3798 || SSize_t_MAX - delta - (max1 - min1) < 0)
3799 delta = SSize_t_MAX;
3801 delta += max1 - min1;
3802 if (flags & SCF_DO_STCLASS_OR) {
3803 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
3805 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3806 flags &= ~SCF_DO_STCLASS;
3809 else if (flags & SCF_DO_STCLASS_AND) {
3811 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3812 flags &= ~SCF_DO_STCLASS;
3815 /* Switch to OR mode: cache the old value of
3816 * data->start_class */
3818 StructCopy(data->start_class, and_withp, regnode_ssc);
3819 flags &= ~SCF_DO_STCLASS_AND;
3820 StructCopy(&accum, data->start_class, regnode_ssc);
3821 flags |= SCF_DO_STCLASS_OR;
3825 if (PERL_ENABLE_TRIE_OPTIMISATION &&
3826 OP( startbranch ) == BRANCH )
3830 Assuming this was/is a branch we are dealing with: 'scan'
3831 now points at the item that follows the branch sequence,
3832 whatever it is. We now start at the beginning of the
3833 sequence and look for subsequences of
3839 which would be constructed from a pattern like
3842 If we can find such a subsequence we need to turn the first
3843 element into a trie and then add the subsequent branch exact
3844 strings to the trie.
3848 1. patterns where the whole set of branches can be
3851 2. patterns where only a subset can be converted.
3853 In case 1 we can replace the whole set with a single regop
3854 for the trie. In case 2 we need to keep the start and end
3857 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3858 becomes BRANCH TRIE; BRANCH X;
3860 There is an additional case, that being where there is a
3861 common prefix, which gets split out into an EXACT like node
3862 preceding the TRIE node.
3864 If x(1..n)==tail then we can do a simple trie, if not we make
3865 a "jump" trie, such that when we match the appropriate word
3866 we "jump" to the appropriate tail node. Essentially we turn
3867 a nested if into a case structure of sorts.
3872 if (!re_trie_maxbuff) {
3873 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3874 if (!SvIOK(re_trie_maxbuff))
3875 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3877 if ( SvIV(re_trie_maxbuff)>=0 ) {
3879 regnode *first = (regnode *)NULL;
3880 regnode *last = (regnode *)NULL;
3881 regnode *tail = scan;
3886 SV * const mysv = sv_newmortal(); /* for dumping */
3888 /* var tail is used because there may be a TAIL
3889 regop in the way. Ie, the exacts will point to the
3890 thing following the TAIL, but the last branch will
3891 point at the TAIL. So we advance tail. If we
3892 have nested (?:) we may have to move through several
3896 while ( OP( tail ) == TAIL ) {
3897 /* this is the TAIL generated by (?:) */
3898 tail = regnext( tail );
3902 DEBUG_TRIE_COMPILE_r({
3903 regprop(RExC_rx, mysv, tail, NULL);
3904 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3905 (int)depth * 2 + 2, "",
3906 "Looking for TRIE'able sequences. Tail node is: ",
3907 SvPV_nolen_const( mysv )
3913 Step through the branches
3914 cur represents each branch,
3915 noper is the first thing to be matched as part
3917 noper_next is the regnext() of that node.
3919 We normally handle a case like this
3920 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
3921 support building with NOJUMPTRIE, which restricts
3922 the trie logic to structures like /FOO|BAR/.
3924 If noper is a trieable nodetype then the branch is
3925 a possible optimization target. If we are building
3926 under NOJUMPTRIE then we require that noper_next is
3927 the same as scan (our current position in the regex
3930 Once we have two or more consecutive such branches
3931 we can create a trie of the EXACT's contents and
3932 stitch it in place into the program.
3934 If the sequence represents all of the branches in
3935 the alternation we replace the entire thing with a
3938 Otherwise when it is a subsequence we need to
3939 stitch it in place and replace only the relevant
3940 branches. This means the first branch has to remain
3941 as it is used by the alternation logic, and its
3942 next pointer, and needs to be repointed at the item
3943 on the branch chain following the last branch we
3944 have optimized away.
3946 This could be either a BRANCH, in which case the
3947 subsequence is internal, or it could be the item
3948 following the branch sequence in which case the
3949 subsequence is at the end (which does not
3950 necessarily mean the first node is the start of the
3953 TRIE_TYPE(X) is a define which maps the optype to a
3957 ----------------+-----------
3961 EXACTFU_SS | EXACTFU
3966 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3967 ( EXACT == (X) ) ? EXACT : \
3968 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
3969 ( EXACTFA == (X) ) ? EXACTFA : \
3972 /* dont use tail as the end marker for this traverse */
3973 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3974 regnode * const noper = NEXTOPER( cur );
3975 U8 noper_type = OP( noper );
3976 U8 noper_trietype = TRIE_TYPE( noper_type );
3977 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
3978 regnode * const noper_next = regnext( noper );
3979 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3980 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
3983 DEBUG_TRIE_COMPILE_r({
3984 regprop(RExC_rx, mysv, cur, NULL);
3985 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3986 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3988 regprop(RExC_rx, mysv, noper, NULL);
3989 PerlIO_printf( Perl_debug_log, " -> %s",
3990 SvPV_nolen_const(mysv));
3993 regprop(RExC_rx, mysv, noper_next, NULL);
3994 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3995 SvPV_nolen_const(mysv));
3997 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3998 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3999 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
4003 /* Is noper a trieable nodetype that can be merged
4004 * with the current trie (if there is one)? */
4008 ( noper_trietype == NOTHING)
4009 || ( trietype == NOTHING )
4010 || ( trietype == noper_trietype )
4013 && noper_next == tail
4017 /* Handle mergable triable node Either we are
4018 * the first node in a new trieable sequence,
4019 * in which case we do some bookkeeping,
4020 * otherwise we update the end pointer. */
4023 if ( noper_trietype == NOTHING ) {
4024 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
4025 regnode * const noper_next = regnext( noper );
4026 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
4027 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
4030 if ( noper_next_trietype ) {
4031 trietype = noper_next_trietype;
4032 } else if (noper_next_type) {
4033 /* a NOTHING regop is 1 regop wide.
4034 * We need at least two for a trie
4035 * so we can't merge this in */
4039 trietype = noper_trietype;
4042 if ( trietype == NOTHING )
4043 trietype = noper_trietype;
4048 } /* end handle mergable triable node */
4050 /* handle unmergable node -
4051 * noper may either be a triable node which can
4052 * not be tried together with the current trie,
4053 * or a non triable node */
4055 /* If last is set and trietype is not
4056 * NOTHING then we have found at least two
4057 * triable branch sequences in a row of a
4058 * similar trietype so we can turn them
4059 * into a trie. If/when we allow NOTHING to
4060 * start a trie sequence this condition
4061 * will be required, and it isn't expensive
4062 * so we leave it in for now. */
4063 if ( trietype && trietype != NOTHING )
4064 make_trie( pRExC_state,
4065 startbranch, first, cur, tail,
4066 count, trietype, depth+1 );
4067 last = NULL; /* note: we clear/update
4068 first, trietype etc below,
4069 so we dont do it here */
4073 && noper_next == tail
4076 /* noper is triable, so we can start a new
4080 trietype = noper_trietype;
4082 /* if we already saw a first but the
4083 * current node is not triable then we have
4084 * to reset the first information. */
4089 } /* end handle unmergable node */
4090 } /* loop over branches */
4091 DEBUG_TRIE_COMPILE_r({
4092 regprop(RExC_rx, mysv, cur, NULL);
4093 PerlIO_printf( Perl_debug_log,
4094 "%*s- %s (%d) <SCAN FINISHED>\n",
4096 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4099 if ( last && trietype ) {
4100 if ( trietype != NOTHING ) {
4101 /* the last branch of the sequence was part of
4102 * a trie, so we have to construct it here
4103 * outside of the loop */
4104 made= make_trie( pRExC_state, startbranch,
4105 first, scan, tail, count,
4106 trietype, depth+1 );
4107 #ifdef TRIE_STUDY_OPT
4108 if ( ((made == MADE_EXACT_TRIE &&
4109 startbranch == first)
4110 || ( first_non_open == first )) &&
4112 flags |= SCF_TRIE_RESTUDY;
4113 if ( startbranch == first
4116 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
4121 /* at this point we know whatever we have is a
4122 * NOTHING sequence/branch AND if 'startbranch'
4123 * is 'first' then we can turn the whole thing
4126 if ( startbranch == first ) {
4128 /* the entire thing is a NOTHING sequence,
4129 * something like this: (?:|) So we can
4130 * turn it into a plain NOTHING op. */
4131 DEBUG_TRIE_COMPILE_r({
4132 regprop(RExC_rx, mysv, cur, NULL);
4133 PerlIO_printf( Perl_debug_log,
4134 "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
4135 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
4138 OP(startbranch)= NOTHING;
4139 NEXT_OFF(startbranch)= tail - startbranch;
4140 for ( opt= startbranch + 1; opt < tail ; opt++ )
4144 } /* end if ( last) */
4145 } /* TRIE_MAXBUF is non zero */
4150 else if ( code == BRANCHJ ) { /* single branch is optimized. */
4151 scan = NEXTOPER(NEXTOPER(scan));
4152 } else /* single branch is optimized. */
4153 scan = NEXTOPER(scan);
4155 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
4156 scan_frame *newframe = NULL;
4160 U32 my_recursed_depth= recursed_depth;
4162 if (OP(scan) != SUSPEND) {
4163 /* set the pointer */
4164 if (OP(scan) == GOSUB) {
4166 RExC_recurse[ARG2L(scan)] = scan;
4167 start = RExC_open_parens[paren-1];
4168 end = RExC_close_parens[paren-1];
4171 start = RExC_rxi->program + 1;
4176 !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren)
4178 if (!recursed_depth) {
4179 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
4181 Copy(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes),
4182 RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes),
4183 RExC_study_chunk_recursed_bytes, U8);
4185 /* we havent recursed into this paren yet, so recurse into it */
4186 DEBUG_STUDYDATA("set:", data,depth);
4187 PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
4188 my_recursed_depth= recursed_depth + 1;
4189 Newx(newframe,1,scan_frame);
4191 DEBUG_STUDYDATA("inf:", data,depth);
4192 /* some form of infinite recursion, assume infinite length
4194 if (flags & SCF_DO_SUBSTR) {
4195 scan_commit(pRExC_state, data, minlenp, is_inf);
4196 data->longest = &(data->longest_float);
4198 is_inf = is_inf_internal = 1;
4199 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4200 ssc_anything(data->start_class);
4201 flags &= ~SCF_DO_STCLASS;
4204 Newx(newframe,1,scan_frame);
4207 end = regnext(scan);
4212 SAVEFREEPV(newframe);
4213 newframe->next = regnext(scan);
4214 newframe->last = last;
4215 newframe->stop = stopparen;
4216 newframe->prev = frame;
4217 newframe->prev_recursed_depth = recursed_depth;
4219 DEBUG_STUDYDATA("frame-new:",data,depth);
4220 DEBUG_PEEP("fnew", scan, depth);
4227 recursed_depth= my_recursed_depth;
4232 else if (OP(scan) == EXACT) {
4233 SSize_t l = STR_LEN(scan);
4236 const U8 * const s = (U8*)STRING(scan);
4237 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4238 l = utf8_length(s, s + l);
4240 uc = *((U8*)STRING(scan));
4243 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
4244 /* The code below prefers earlier match for fixed
4245 offset, later match for variable offset. */
4246 if (data->last_end == -1) { /* Update the start info. */
4247 data->last_start_min = data->pos_min;
4248 data->last_start_max = is_inf
4249 ? SSize_t_MAX : data->pos_min + data->pos_delta;
4251 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
4253 SvUTF8_on(data->last_found);
4255 SV * const sv = data->last_found;
4256 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4257 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4258 if (mg && mg->mg_len >= 0)
4259 mg->mg_len += utf8_length((U8*)STRING(scan),
4260 (U8*)STRING(scan)+STR_LEN(scan));
4262 data->last_end = data->pos_min + l;
4263 data->pos_min += l; /* As in the first entry. */
4264 data->flags &= ~SF_BEFORE_EOL;
4267 /* ANDing the code point leaves at most it, and not in locale, and
4268 * can't match null string */
4269 if (flags & SCF_DO_STCLASS_AND) {
4270 ssc_cp_and(data->start_class, uc);
4271 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4272 ssc_clear_locale(data->start_class);
4274 else if (flags & SCF_DO_STCLASS_OR) {
4275 ssc_add_cp(data->start_class, uc);
4276 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4278 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4279 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4281 flags &= ~SCF_DO_STCLASS;
4283 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
4285 SSize_t l = STR_LEN(scan);
4286 UV uc = *((U8*)STRING(scan));
4287 SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
4288 separate code points */
4289 const U8 * s = (U8*)STRING(scan);
4291 /* Search for fixed substrings supports EXACT only. */
4292 if (flags & SCF_DO_SUBSTR) {
4294 scan_commit(pRExC_state, data, minlenp, is_inf);
4297 uc = utf8_to_uvchr_buf(s, s + l, NULL);
4298 l = utf8_length(s, s + l);
4300 if (unfolded_multi_char) {
4301 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
4303 min += l - min_subtract;
4305 delta += min_subtract;
4306 if (flags & SCF_DO_SUBSTR) {
4307 data->pos_min += l - min_subtract;
4308 if (data->pos_min < 0) {
4311 data->pos_delta += min_subtract;
4313 data->longest = &(data->longest_float);
4317 if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
4318 ssc_clear_locale(data->start_class);
4323 /* We punt and assume can match anything if the node begins
4324 * with a multi-character fold. Things are complicated. For
4325 * example, /ffi/i could match any of:
4326 * "\N{LATIN SMALL LIGATURE FFI}"
4327 * "\N{LATIN SMALL LIGATURE FF}I"
4328 * "F\N{LATIN SMALL LIGATURE FI}"
4329 * plus several other things; and making sure we have all the
4330 * possibilities is hard. */
4331 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
4333 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4337 /* Any Latin1 range character can potentially match any
4338 * other depending on the locale */
4339 if (OP(scan) == EXACTFL) {
4340 _invlist_union(EXACTF_invlist, PL_Latin1,
4344 /* But otherwise, it matches at least itself. We can
4345 * quickly tell if it has a distinct fold, and if so,
4346 * it matches that as well */
4347 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4348 if (IS_IN_SOME_FOLD_L1(uc)) {
4349 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
4350 PL_fold_latin1[uc]);
4354 /* Some characters match above-Latin1 ones under /i. This
4355 * is true of EXACTFL ones when the locale is UTF-8 */
4356 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
4357 && (! isASCII(uc) || (OP(scan) != EXACTFA
4358 && OP(scan) != EXACTFA_NO_TRIE)))
4360 add_above_Latin1_folds(pRExC_state,
4366 else { /* Pattern is UTF-8 */
4367 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
4368 STRLEN foldlen = UTF8SKIP(s);
4369 const U8* e = s + STR_LEN(scan);
4372 /* The only code points that aren't folded in a UTF EXACTFish
4373 * node are are the problematic ones in EXACTFL nodes */
4374 if (OP(scan) == EXACTFL
4375 && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
4377 /* We need to check for the possibility that this EXACTFL
4378 * node begins with a multi-char fold. Therefore we fold
4379 * the first few characters of it so that we can make that
4384 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
4386 *(d++) = (U8) toFOLD(*s);
4391 to_utf8_fold(s, d, &len);
4397 /* And set up so the code below that looks in this folded
4398 * buffer instead of the node's string */
4400 foldlen = UTF8SKIP(folded);
4404 /* When we reach here 's' points to the fold of the first
4405 * character(s) of the node; and 'e' points to far enough along
4406 * the folded string to be just past any possible multi-char
4407 * fold. 'foldlen' is the length in bytes of the first
4410 * Unlike the non-UTF-8 case, the macro for determining if a
4411 * string is a multi-char fold requires all the characters to
4412 * already be folded. This is because of all the complications
4413 * if not. Note that they are folded anyway, except in EXACTFL
4414 * nodes. Like the non-UTF case above, we punt if the node
4415 * begins with a multi-char fold */
4417 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
4419 _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
4421 else { /* Single char fold */
4423 /* It matches all the things that fold to it, which are
4424 * found in PL_utf8_foldclosures (including itself) */
4425 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
4426 if (! PL_utf8_foldclosures) {
4427 _load_PL_utf8_foldclosures();
4429 if ((listp = hv_fetch(PL_utf8_foldclosures,
4430 (char *) s, foldlen, FALSE)))
4432 AV* list = (AV*) *listp;
4434 for (k = 0; k <= av_tindex(list); k++) {
4435 SV** c_p = av_fetch(list, k, FALSE);
4441 /* /aa doesn't allow folds between ASCII and non- */
4442 if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
4443 && isASCII(c) != isASCII(uc))
4448 EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
4453 if (flags & SCF_DO_STCLASS_AND) {
4454 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4455 ANYOF_POSIXL_ZERO(data->start_class);
4456 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
4458 else if (flags & SCF_DO_STCLASS_OR) {
4459 ssc_union(data->start_class, EXACTF_invlist, FALSE);
4460 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4462 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4463 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4465 flags &= ~SCF_DO_STCLASS;
4466 SvREFCNT_dec(EXACTF_invlist);
4468 else if (REGNODE_VARIES(OP(scan))) {
4469 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
4470 I32 fl = 0, f = flags;
4471 regnode * const oscan = scan;
4472 regnode_ssc this_class;
4473 regnode_ssc *oclass = NULL;
4474 I32 next_is_eval = 0;
4476 switch (PL_regkind[OP(scan)]) {
4477 case WHILEM: /* End of (?:...)* . */
4478 scan = NEXTOPER(scan);
4481 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
4482 next = NEXTOPER(scan);
4483 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
4485 maxcount = REG_INFTY;
4486 next = regnext(scan);
4487 scan = NEXTOPER(scan);
4491 if (flags & SCF_DO_SUBSTR)
4496 if (flags & SCF_DO_STCLASS) {
4498 maxcount = REG_INFTY;
4499 next = regnext(scan);
4500 scan = NEXTOPER(scan);
4503 if (flags & SCF_DO_SUBSTR) {
4504 scan_commit(pRExC_state, data, minlenp, is_inf);
4505 /* Cannot extend fixed substrings */
4506 data->longest = &(data->longest_float);
4508 is_inf = is_inf_internal = 1;
4509 scan = regnext(scan);
4510 goto optimize_curly_tail;
4512 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
4513 && (scan->flags == stopparen))
4518 mincount = ARG1(scan);
4519 maxcount = ARG2(scan);
4521 next = regnext(scan);
4522 if (OP(scan) == CURLYX) {
4523 I32 lp = (data ? *(data->last_closep) : 0);
4524 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
4526 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
4527 next_is_eval = (OP(scan) == EVAL);
4529 if (flags & SCF_DO_SUBSTR) {
4531 scan_commit(pRExC_state, data, minlenp, is_inf);
4532 /* Cannot extend fixed substrings */
4533 pos_before = data->pos_min;
4537 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
4539 data->flags |= SF_IS_INF;
4541 if (flags & SCF_DO_STCLASS) {
4542 ssc_init(pRExC_state, &this_class);
4543 oclass = data->start_class;
4544 data->start_class = &this_class;
4545 f |= SCF_DO_STCLASS_AND;
4546 f &= ~SCF_DO_STCLASS_OR;
4548 /* Exclude from super-linear cache processing any {n,m}
4549 regops for which the combination of input pos and regex
4550 pos is not enough information to determine if a match
4553 For example, in the regex /foo(bar\s*){4,8}baz/ with the
4554 regex pos at the \s*, the prospects for a match depend not
4555 only on the input position but also on how many (bar\s*)
4556 repeats into the {4,8} we are. */
4557 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
4558 f &= ~SCF_WHILEM_VISITED_POS;
4560 /* This will finish on WHILEM, setting scan, or on NULL: */
4561 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
4562 last, data, stopparen, recursed_depth, NULL,
4564 ? (f & ~SCF_DO_SUBSTR)
4568 if (flags & SCF_DO_STCLASS)
4569 data->start_class = oclass;
4570 if (mincount == 0 || minnext == 0) {
4571 if (flags & SCF_DO_STCLASS_OR) {
4572 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4574 else if (flags & SCF_DO_STCLASS_AND) {
4575 /* Switch to OR mode: cache the old value of
4576 * data->start_class */
4578 StructCopy(data->start_class, and_withp, regnode_ssc);
4579 flags &= ~SCF_DO_STCLASS_AND;
4580 StructCopy(&this_class, data->start_class, regnode_ssc);
4581 flags |= SCF_DO_STCLASS_OR;
4582 ANYOF_FLAGS(data->start_class)
4583 |= SSC_MATCHES_EMPTY_STRING;
4585 } else { /* Non-zero len */
4586 if (flags & SCF_DO_STCLASS_OR) {
4587 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4588 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4590 else if (flags & SCF_DO_STCLASS_AND)
4591 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
4592 flags &= ~SCF_DO_STCLASS;
4594 if (!scan) /* It was not CURLYX, but CURLY. */
4596 if (!(flags & SCF_TRIE_DOING_RESTUDY)
4597 /* ? quantifier ok, except for (?{ ... }) */
4598 && (next_is_eval || !(mincount == 0 && maxcount == 1))
4599 && (minnext == 0) && (deltanext == 0)
4600 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
4601 && maxcount <= REG_INFTY/3) /* Complement check for big
4604 /* Fatal warnings may leak the regexp without this: */
4605 SAVEFREESV(RExC_rx_sv);
4606 ckWARNreg(RExC_parse,
4607 "Quantifier unexpected on zero-length expression");
4608 (void)ReREFCNT_inc(RExC_rx_sv);
4611 min += minnext * mincount;
4612 is_inf_internal |= deltanext == SSize_t_MAX
4613 || (maxcount == REG_INFTY && minnext + deltanext > 0);
4614 is_inf |= is_inf_internal;
4616 delta = SSize_t_MAX;
4618 delta += (minnext + deltanext) * maxcount
4619 - minnext * mincount;
4621 /* Try powerful optimization CURLYX => CURLYN. */
4622 if ( OP(oscan) == CURLYX && data
4623 && data->flags & SF_IN_PAR
4624 && !(data->flags & SF_HAS_EVAL)
4625 && !deltanext && minnext == 1 ) {
4626 /* Try to optimize to CURLYN. */
4627 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
4628 regnode * const nxt1 = nxt;
4635 if (!REGNODE_SIMPLE(OP(nxt))
4636 && !(PL_regkind[OP(nxt)] == EXACT
4637 && STR_LEN(nxt) == 1))
4643 if (OP(nxt) != CLOSE)
4645 if (RExC_open_parens) {
4646 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4647 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
4649 /* Now we know that nxt2 is the only contents: */
4650 oscan->flags = (U8)ARG(nxt);
4652 OP(nxt1) = NOTHING; /* was OPEN. */
4655 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4656 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
4657 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
4658 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4659 OP(nxt + 1) = OPTIMIZED; /* was count. */
4660 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
4665 /* Try optimization CURLYX => CURLYM. */
4666 if ( OP(oscan) == CURLYX && data
4667 && !(data->flags & SF_HAS_PAR)
4668 && !(data->flags & SF_HAS_EVAL)
4669 && !deltanext /* atom is fixed width */
4670 && minnext != 0 /* CURLYM can't handle zero width */
4672 /* Nor characters whose fold at run-time may be
4673 * multi-character */
4674 && ! (RExC_seen & REG_UNFOLDED_MULTI_SEEN)
4676 /* XXXX How to optimize if data == 0? */
4677 /* Optimize to a simpler form. */
4678 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
4682 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
4683 && (OP(nxt2) != WHILEM))
4685 OP(nxt2) = SUCCEED; /* Whas WHILEM */
4686 /* Need to optimize away parenths. */
4687 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
4688 /* Set the parenth number. */
4689 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
4691 oscan->flags = (U8)ARG(nxt);
4692 if (RExC_open_parens) {
4693 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
4694 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
4696 OP(nxt1) = OPTIMIZED; /* was OPEN. */
4697 OP(nxt) = OPTIMIZED; /* was CLOSE. */
4700 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
4701 OP(nxt + 1) = OPTIMIZED; /* was count. */
4702 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
4703 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
4706 while ( nxt1 && (OP(nxt1) != WHILEM)) {
4707 regnode *nnxt = regnext(nxt1);
4709 if (reg_off_by_arg[OP(nxt1)])
4710 ARG_SET(nxt1, nxt2 - nxt1);
4711 else if (nxt2 - nxt1 < U16_MAX)
4712 NEXT_OFF(nxt1) = nxt2 - nxt1;
4714 OP(nxt) = NOTHING; /* Cannot beautify */
4719 /* Optimize again: */
4720 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
4721 NULL, stopparen, recursed_depth, NULL, 0,depth+1);
4726 else if ((OP(oscan) == CURLYX)
4727 && (flags & SCF_WHILEM_VISITED_POS)
4728 /* See the comment on a similar expression above.
4729 However, this time it's not a subexpression
4730 we care about, but the expression itself. */
4731 && (maxcount == REG_INFTY)
4732 && data && ++data->whilem_c < 16) {
4733 /* This stays as CURLYX, we can put the count/of pair. */
4734 /* Find WHILEM (as in regexec.c) */
4735 regnode *nxt = oscan + NEXT_OFF(oscan);
4737 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
4739 PREVOPER(nxt)->flags = (U8)(data->whilem_c
4740 | (RExC_whilem_seen << 4)); /* On WHILEM */
4742 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
4744 if (flags & SCF_DO_SUBSTR) {
4745 SV *last_str = NULL;
4746 STRLEN last_chrs = 0;
4747 int counted = mincount != 0;
4749 if (data->last_end > 0 && mincount != 0) { /* Ends with a
4751 SSize_t b = pos_before >= data->last_start_min
4752 ? pos_before : data->last_start_min;
4754 const char * const s = SvPV_const(data->last_found, l);
4755 SSize_t old = b - data->last_start_min;
4758 old = utf8_hop((U8*)s, old) - (U8*)s;
4760 /* Get the added string: */
4761 last_str = newSVpvn_utf8(s + old, l, UTF);
4762 last_chrs = UTF ? utf8_length((U8*)(s + old),
4763 (U8*)(s + old + l)) : l;
4764 if (deltanext == 0 && pos_before == b) {
4765 /* What was added is a constant string */
4768 SvGROW(last_str, (mincount * l) + 1);
4769 repeatcpy(SvPVX(last_str) + l,
4770 SvPVX_const(last_str), l,
4772 SvCUR_set(last_str, SvCUR(last_str) * mincount);
4773 /* Add additional parts. */
4774 SvCUR_set(data->last_found,
4775 SvCUR(data->last_found) - l);
4776 sv_catsv(data->last_found, last_str);
4778 SV * sv = data->last_found;
4780 SvUTF8(sv) && SvMAGICAL(sv) ?
4781 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4782 if (mg && mg->mg_len >= 0)
4783 mg->mg_len += last_chrs * (mincount-1);
4785 last_chrs *= mincount;
4786 data->last_end += l * (mincount - 1);
4789 /* start offset must point into the last copy */
4790 data->last_start_min += minnext * (mincount - 1);
4791 data->last_start_max += is_inf ? SSize_t_MAX
4792 : (maxcount - 1) * (minnext + data->pos_delta);
4795 /* It is counted once already... */
4796 data->pos_min += minnext * (mincount - counted);
4798 PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
4799 " SSize_t_MAX=%"UVuf" minnext=%"UVuf
4800 " maxcount=%"UVuf" mincount=%"UVuf"\n",
4801 (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
4803 if (deltanext != SSize_t_MAX)
4804 PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
4805 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
4806 - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
4808 if (deltanext == SSize_t_MAX
4809 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
4810 data->pos_delta = SSize_t_MAX;
4812 data->pos_delta += - counted * deltanext +
4813 (minnext + deltanext) * maxcount - minnext * mincount;
4814 if (mincount != maxcount) {
4815 /* Cannot extend fixed substrings found inside
4817 scan_commit(pRExC_state, data, minlenp, is_inf);
4818 if (mincount && last_str) {
4819 SV * const sv = data->last_found;
4820 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
4821 mg_find(sv, PERL_MAGIC_utf8) : NULL;
4825 sv_setsv(sv, last_str);
4826 data->last_end = data->pos_min;
4827 data->last_start_min = data->pos_min - last_chrs;
4828 data->last_start_max = is_inf
4830 : data->pos_min + data->pos_delta - last_chrs;
4832 data->longest = &(data->longest_float);
4834 SvREFCNT_dec(last_str);
4836 if (data && (fl & SF_HAS_EVAL))
4837 data->flags |= SF_HAS_EVAL;
4838 optimize_curly_tail:
4839 if (OP(oscan) != CURLYX) {
4840 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
4842 NEXT_OFF(oscan) += NEXT_OFF(next);
4848 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
4853 if (flags & SCF_DO_SUBSTR) {
4854 /* Cannot expect anything... */
4855 scan_commit(pRExC_state, data, minlenp, is_inf);
4856 data->longest = &(data->longest_float);
4858 is_inf = is_inf_internal = 1;
4859 if (flags & SCF_DO_STCLASS_OR) {
4860 if (OP(scan) == CLUMP) {
4861 /* Actually is any start char, but very few code points
4862 * aren't start characters */
4863 ssc_match_all_cp(data->start_class);
4866 ssc_anything(data->start_class);
4869 flags &= ~SCF_DO_STCLASS;
4873 else if (OP(scan) == LNBREAK) {
4874 if (flags & SCF_DO_STCLASS) {
4875 if (flags & SCF_DO_STCLASS_AND) {
4876 ssc_intersection(data->start_class,
4877 PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
4878 ssc_clear_locale(data->start_class);
4879 ANYOF_FLAGS(data->start_class)
4880 &= ~SSC_MATCHES_EMPTY_STRING;
4882 else if (flags & SCF_DO_STCLASS_OR) {
4883 ssc_union(data->start_class,
4884 PL_XPosix_ptrs[_CC_VERTSPACE],
4886 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
4888 /* See commit msg for
4889 * 749e076fceedeb708a624933726e7989f2302f6a */
4890 ANYOF_FLAGS(data->start_class)
4891 &= ~SSC_MATCHES_EMPTY_STRING;
4893 flags &= ~SCF_DO_STCLASS;
4896 delta++; /* Because of the 2 char string cr-lf */
4897 if (flags & SCF_DO_SUBSTR) {
4898 /* Cannot expect anything... */
4899 scan_commit(pRExC_state, data, minlenp, is_inf);
4901 data->pos_delta += 1;
4902 data->longest = &(data->longest_float);
4905 else if (REGNODE_SIMPLE(OP(scan))) {
4907 if (flags & SCF_DO_SUBSTR) {
4908 scan_commit(pRExC_state, data, minlenp, is_inf);
4912 if (flags & SCF_DO_STCLASS) {
4914 SV* my_invlist = sv_2mortal(_new_invlist(0));
4917 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
4918 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
4920 /* Some of the logic below assumes that switching
4921 locale on will only add false positives. */
4926 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
4931 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
4932 ssc_match_all_cp(data->start_class);
4937 SV* REG_ANY_invlist = _new_invlist(2);
4938 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
4940 if (flags & SCF_DO_STCLASS_OR) {
4941 ssc_union(data->start_class,
4943 TRUE /* TRUE => invert, hence all but \n
4947 else if (flags & SCF_DO_STCLASS_AND) {
4948 ssc_intersection(data->start_class,
4950 TRUE /* TRUE => invert */
4952 ssc_clear_locale(data->start_class);
4954 SvREFCNT_dec_NN(REG_ANY_invlist);
4959 if (flags & SCF_DO_STCLASS_AND)
4960 ssc_and(pRExC_state, data->start_class,
4961 (regnode_charclass *) scan);
4963 ssc_or(pRExC_state, data->start_class,
4964 (regnode_charclass *) scan);
4972 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
4973 if (flags & SCF_DO_STCLASS_AND) {
4974 bool was_there = cBOOL(
4975 ANYOF_POSIXL_TEST(data->start_class,
4977 ANYOF_POSIXL_ZERO(data->start_class);
4978 if (was_there) { /* Do an AND */
4979 ANYOF_POSIXL_SET(data->start_class, namedclass);
4981 /* No individual code points can now match */
4982 data->start_class->invlist
4983 = sv_2mortal(_new_invlist(0));
4986 int complement = namedclass + ((invert) ? -1 : 1);
4988 assert(flags & SCF_DO_STCLASS_OR);
4990 /* If the complement of this class was already there,
4991 * the result is that they match all code points,
4992 * (\d + \D == everything). Remove the classes from
4993 * future consideration. Locale is not relevant in
4995 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
4996 ssc_match_all_cp(data->start_class);
4997 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
4998 ANYOF_POSIXL_CLEAR(data->start_class, complement);
5000 else { /* The usual case; just add this class to the
5002 ANYOF_POSIXL_SET(data->start_class, namedclass);
5007 case NPOSIXA: /* For these, we always know the exact set of
5012 if (FLAGS(scan) == _CC_ASCII) {
5013 my_invlist = PL_XPosix_ptrs[_CC_ASCII];
5016 _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
5017 PL_XPosix_ptrs[_CC_ASCII],
5028 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
5030 /* NPOSIXD matches all upper Latin1 code points unless the
5031 * target string being matched is UTF-8, which is
5032 * unknowable until match time. Since we are going to
5033 * invert, we want to get rid of all of them so that the
5034 * inversion will match all */
5035 if (OP(scan) == NPOSIXD) {
5036 _invlist_subtract(my_invlist, PL_UpperLatin1,
5042 if (flags & SCF_DO_STCLASS_AND) {
5043 ssc_intersection(data->start_class, my_invlist, invert);
5044 ssc_clear_locale(data->start_class);
5047 assert(flags & SCF_DO_STCLASS_OR);
5048 ssc_union(data->start_class, my_invlist, invert);
5051 if (flags & SCF_DO_STCLASS_OR)
5052 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5053 flags &= ~SCF_DO_STCLASS;
5056 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
5057 data->flags |= (OP(scan) == MEOL
5060 scan_commit(pRExC_state, data, minlenp, is_inf);
5063 else if ( PL_regkind[OP(scan)] == BRANCHJ
5064 /* Lookbehind, or need to calculate parens/evals/stclass: */
5065 && (scan->flags || data || (flags & SCF_DO_STCLASS))
5066 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
5068 if ( OP(scan) == UNLESSM &&
5070 OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
5071 OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
5074 regnode *upto= regnext(scan);
5076 SV * const mysv_val=sv_newmortal();
5077 DEBUG_STUDYDATA("OPFAIL",data,depth);
5079 /*DEBUG_PARSE_MSG("opfail");*/
5080 regprop(RExC_rx, mysv_val, upto, NULL);
5081 PerlIO_printf(Perl_debug_log,
5082 "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
5083 SvPV_nolen_const(mysv_val),
5084 (IV)REG_NODE_NUM(upto),
5089 NEXT_OFF(scan) = upto - scan;
5090 for (opt= scan + 1; opt < upto ; opt++)
5091 OP(opt) = OPTIMIZED;
5095 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5096 || OP(scan) == UNLESSM )
5098 /* Negative Lookahead/lookbehind
5099 In this case we can't do fixed string optimisation.
5102 SSize_t deltanext, minnext, fake = 0;
5107 data_fake.flags = 0;
5109 data_fake.whilem_c = data->whilem_c;
5110 data_fake.last_closep = data->last_closep;
5113 data_fake.last_closep = &fake;
5114 data_fake.pos_delta = delta;
5115 if ( flags & SCF_DO_STCLASS && !scan->flags
5116 && OP(scan) == IFMATCH ) { /* Lookahead */
5117 ssc_init(pRExC_state, &intrnl);
5118 data_fake.start_class = &intrnl;
5119 f |= SCF_DO_STCLASS_AND;
5121 if (flags & SCF_WHILEM_VISITED_POS)
5122 f |= SCF_WHILEM_VISITED_POS;
5123 next = regnext(scan);
5124 nscan = NEXTOPER(NEXTOPER(scan));
5125 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
5126 last, &data_fake, stopparen,
5127 recursed_depth, NULL, f, depth+1);
5130 FAIL("Variable length lookbehind not implemented");
5132 else if (minnext > (I32)U8_MAX) {
5133 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5136 scan->flags = (U8)minnext;
5139 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5141 if (data_fake.flags & SF_HAS_EVAL)
5142 data->flags |= SF_HAS_EVAL;
5143 data->whilem_c = data_fake.whilem_c;
5145 if (f & SCF_DO_STCLASS_AND) {
5146 if (flags & SCF_DO_STCLASS_OR) {
5147 /* OR before, AND after: ideally we would recurse with
5148 * data_fake to get the AND applied by study of the
5149 * remainder of the pattern, and then derecurse;
5150 * *** HACK *** for now just treat as "no information".
5151 * See [perl #56690].
5153 ssc_init(pRExC_state, data->start_class);
5155 /* AND before and after: combine and continue. These
5156 * assertions are zero-length, so can match an EMPTY
5158 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5159 ANYOF_FLAGS(data->start_class)
5160 |= SSC_MATCHES_EMPTY_STRING;
5164 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
5166 /* Positive Lookahead/lookbehind
5167 In this case we can do fixed string optimisation,
5168 but we must be careful about it. Note in the case of
5169 lookbehind the positions will be offset by the minimum
5170 length of the pattern, something we won't know about
5171 until after the recurse.
5173 SSize_t deltanext, fake = 0;
5177 /* We use SAVEFREEPV so that when the full compile
5178 is finished perl will clean up the allocated
5179 minlens when it's all done. This way we don't
5180 have to worry about freeing them when we know
5181 they wont be used, which would be a pain.
5184 Newx( minnextp, 1, SSize_t );
5185 SAVEFREEPV(minnextp);
5188 StructCopy(data, &data_fake, scan_data_t);
5189 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
5192 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
5193 data_fake.last_found=newSVsv(data->last_found);
5197 data_fake.last_closep = &fake;
5198 data_fake.flags = 0;
5199 data_fake.pos_delta = delta;
5201 data_fake.flags |= SF_IS_INF;
5202 if ( flags & SCF_DO_STCLASS && !scan->flags
5203 && OP(scan) == IFMATCH ) { /* Lookahead */
5204 ssc_init(pRExC_state, &intrnl);
5205 data_fake.start_class = &intrnl;
5206 f |= SCF_DO_STCLASS_AND;
5208 if (flags & SCF_WHILEM_VISITED_POS)
5209 f |= SCF_WHILEM_VISITED_POS;
5210 next = regnext(scan);
5211 nscan = NEXTOPER(NEXTOPER(scan));
5213 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
5214 &deltanext, last, &data_fake,
5215 stopparen, recursed_depth, NULL,
5219 FAIL("Variable length lookbehind not implemented");
5221 else if (*minnextp > (I32)U8_MAX) {
5222 FAIL2("Lookbehind longer than %"UVuf" not implemented",
5225 scan->flags = (U8)*minnextp;
5230 if (f & SCF_DO_STCLASS_AND) {
5231 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
5232 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
5235 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5237 if (data_fake.flags & SF_HAS_EVAL)
5238 data->flags |= SF_HAS_EVAL;
5239 data->whilem_c = data_fake.whilem_c;
5240 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
5241 if (RExC_rx->minlen<*minnextp)
5242 RExC_rx->minlen=*minnextp;
5243 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
5244 SvREFCNT_dec_NN(data_fake.last_found);
5246 if ( data_fake.minlen_fixed != minlenp )
5248 data->offset_fixed= data_fake.offset_fixed;
5249 data->minlen_fixed= data_fake.minlen_fixed;
5250 data->lookbehind_fixed+= scan->flags;
5252 if ( data_fake.minlen_float != minlenp )
5254 data->minlen_float= data_fake.minlen_float;
5255 data->offset_float_min=data_fake.offset_float_min;
5256 data->offset_float_max=data_fake.offset_float_max;
5257 data->lookbehind_float+= scan->flags;
5264 else if (OP(scan) == OPEN) {
5265 if (stopparen != (I32)ARG(scan))
5268 else if (OP(scan) == CLOSE) {
5269 if (stopparen == (I32)ARG(scan)) {
5272 if ((I32)ARG(scan) == is_par) {
5273 next = regnext(scan);
5275 if ( next && (OP(next) != WHILEM) && next < last)
5276 is_par = 0; /* Disable optimization */
5279 *(data->last_closep) = ARG(scan);
5281 else if (OP(scan) == EVAL) {
5283 data->flags |= SF_HAS_EVAL;
5285 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
5286 if (flags & SCF_DO_SUBSTR) {
5287 scan_commit(pRExC_state, data, minlenp, is_inf);
5288 flags &= ~SCF_DO_SUBSTR;
5290 if (data && OP(scan)==ACCEPT) {
5291 data->flags |= SCF_SEEN_ACCEPT;
5296 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
5298 if (flags & SCF_DO_SUBSTR) {
5299 scan_commit(pRExC_state, data, minlenp, is_inf);
5300 data->longest = &(data->longest_float);
5302 is_inf = is_inf_internal = 1;
5303 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
5304 ssc_anything(data->start_class);
5305 flags &= ~SCF_DO_STCLASS;
5307 else if (OP(scan) == GPOS) {
5308 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
5309 !(delta || is_inf || (data && data->pos_delta)))
5311 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
5312 RExC_rx->intflags |= PREGf_ANCH_GPOS;
5313 if (RExC_rx->gofs < (STRLEN)min)
5314 RExC_rx->gofs = min;
5316 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
5320 #ifdef TRIE_STUDY_OPT
5321 #ifdef FULL_TRIE_STUDY
5322 else if (PL_regkind[OP(scan)] == TRIE) {
5323 /* NOTE - There is similar code to this block above for handling
5324 BRANCH nodes on the initial study. If you change stuff here
5326 regnode *trie_node= scan;
5327 regnode *tail= regnext(scan);
5328 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5329 SSize_t max1 = 0, min1 = SSize_t_MAX;
5332 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
5333 /* Cannot merge strings after this. */
5334 scan_commit(pRExC_state, data, minlenp, is_inf);
5336 if (flags & SCF_DO_STCLASS)
5337 ssc_init_zero(pRExC_state, &accum);
5343 const regnode *nextbranch= NULL;
5346 for ( word=1 ; word <= trie->wordcount ; word++)
5348 SSize_t deltanext=0, minnext=0, f = 0, fake;
5349 regnode_ssc this_class;
5351 data_fake.flags = 0;
5353 data_fake.whilem_c = data->whilem_c;
5354 data_fake.last_closep = data->last_closep;
5357 data_fake.last_closep = &fake;
5358 data_fake.pos_delta = delta;
5359 if (flags & SCF_DO_STCLASS) {
5360 ssc_init(pRExC_state, &this_class);
5361 data_fake.start_class = &this_class;
5362 f = SCF_DO_STCLASS_AND;
5364 if (flags & SCF_WHILEM_VISITED_POS)
5365 f |= SCF_WHILEM_VISITED_POS;
5367 if (trie->jump[word]) {
5369 nextbranch = trie_node + trie->jump[0];
5370 scan= trie_node + trie->jump[word];
5371 /* We go from the jump point to the branch that follows
5372 it. Note this means we need the vestigal unused
5373 branches even though they arent otherwise used. */
5374 minnext = study_chunk(pRExC_state, &scan, minlenp,
5375 &deltanext, (regnode *)nextbranch, &data_fake,
5376 stopparen, recursed_depth, NULL, f,depth+1);
5378 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
5379 nextbranch= regnext((regnode*)nextbranch);
5381 if (min1 > (SSize_t)(minnext + trie->minlen))
5382 min1 = minnext + trie->minlen;
5383 if (deltanext == SSize_t_MAX) {
5384 is_inf = is_inf_internal = 1;
5386 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
5387 max1 = minnext + deltanext + trie->maxlen;
5389 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
5391 if (data_fake.flags & SCF_SEEN_ACCEPT) {
5392 if ( stopmin > min + min1)
5393 stopmin = min + min1;
5394 flags &= ~SCF_DO_SUBSTR;
5396 data->flags |= SCF_SEEN_ACCEPT;
5399 if (data_fake.flags & SF_HAS_EVAL)
5400 data->flags |= SF_HAS_EVAL;
5401 data->whilem_c = data_fake.whilem_c;
5403 if (flags & SCF_DO_STCLASS)
5404 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
5407 if (flags & SCF_DO_SUBSTR) {
5408 data->pos_min += min1;
5409 data->pos_delta += max1 - min1;
5410 if (max1 != min1 || is_inf)
5411 data->longest = &(data->longest_float);
5414 delta += max1 - min1;
5415 if (flags & SCF_DO_STCLASS_OR) {
5416 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5418 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5419 flags &= ~SCF_DO_STCLASS;
5422 else if (flags & SCF_DO_STCLASS_AND) {
5424 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
5425 flags &= ~SCF_DO_STCLASS;
5428 /* Switch to OR mode: cache the old value of
5429 * data->start_class */
5431 StructCopy(data->start_class, and_withp, regnode_ssc);
5432 flags &= ~SCF_DO_STCLASS_AND;
5433 StructCopy(&accum, data->start_class, regnode_ssc);
5434 flags |= SCF_DO_STCLASS_OR;
5441 else if (PL_regkind[OP(scan)] == TRIE) {
5442 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
5445 min += trie->minlen;
5446 delta += (trie->maxlen - trie->minlen);
5447 flags &= ~SCF_DO_STCLASS; /* xxx */
5448 if (flags & SCF_DO_SUBSTR) {
5449 /* Cannot expect anything... */
5450 scan_commit(pRExC_state, data, minlenp, is_inf);
5451 data->pos_min += trie->minlen;
5452 data->pos_delta += (trie->maxlen - trie->minlen);
5453 if (trie->maxlen != trie->minlen)
5454 data->longest = &(data->longest_float);
5456 if (trie->jump) /* no more substrings -- for now /grr*/
5457 flags &= ~SCF_DO_SUBSTR;
5459 #endif /* old or new */
5460 #endif /* TRIE_STUDY_OPT */
5462 /* Else: zero-length, ignore. */
5463 scan = regnext(scan);
5465 /* If we are exiting a recursion we can unset its recursed bit
5466 * and allow ourselves to enter it again - no danger of an
5467 * infinite loop there.
5468 if (stopparen > -1 && recursed) {
5469 DEBUG_STUDYDATA("unset:", data,depth);
5470 PAREN_UNSET( recursed, stopparen);
5474 DEBUG_STUDYDATA("frame-end:",data,depth);
5475 DEBUG_PEEP("fend", scan, depth);
5476 /* restore previous context */
5479 stopparen = frame->stop;
5480 recursed_depth = frame->prev_recursed_depth;
5483 frame = frame->prev;
5484 goto fake_study_recurse;
5489 DEBUG_STUDYDATA("pre-fin:",data,depth);
5492 *deltap = is_inf_internal ? SSize_t_MAX : delta;
5494 if (flags & SCF_DO_SUBSTR && is_inf)
5495 data->pos_delta = SSize_t_MAX - data->pos_min;
5496 if (is_par > (I32)U8_MAX)
5498 if (is_par && pars==1 && data) {
5499 data->flags |= SF_IN_PAR;
5500 data->flags &= ~SF_HAS_PAR;
5502 else if (pars && data) {
5503 data->flags |= SF_HAS_PAR;
5504 data->flags &= ~SF_IN_PAR;
5506 if (flags & SCF_DO_STCLASS_OR)
5507 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
5508 if (flags & SCF_TRIE_RESTUDY)
5509 data->flags |= SCF_TRIE_RESTUDY;
5511 DEBUG_STUDYDATA("post-fin:",data,depth);
5514 SSize_t final_minlen= min < stopmin ? min : stopmin;
5516 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
5517 RExC_maxlen = final_minlen + delta;
5519 return final_minlen;
5525 S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n)
5527 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
5529 PERL_ARGS_ASSERT_ADD_DATA;
5531 Renewc(RExC_rxi->data,
5532 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
5533 char, struct reg_data);
5535 Renew(RExC_rxi->data->what, count + n, U8);
5537 Newx(RExC_rxi->data->what, n, U8);
5538 RExC_rxi->data->count = count + n;
5539 Copy(s, RExC_rxi->data->what + count, n, U8);
5543 /*XXX: todo make this not included in a non debugging perl, but appears to be
5544 * used anyway there, in 'use re' */
5545 #ifndef PERL_IN_XSUB_RE
5547 Perl_reginitcolors(pTHX)
5549 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
5551 char *t = savepv(s);
5555 t = strchr(t, '\t');
5561 PL_colors[i] = t = (char *)"";
5566 PL_colors[i++] = (char *)"";
5573 #ifdef TRIE_STUDY_OPT
5574 #define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \
5577 (data.flags & SCF_TRIE_RESTUDY) \
5585 #define CHECK_RESTUDY_GOTO_butfirst
5589 * pregcomp - compile a regular expression into internal code
5591 * Decides which engine's compiler to call based on the hint currently in
5595 #ifndef PERL_IN_XSUB_RE
5597 /* return the currently in-scope regex engine (or the default if none) */
5599 regexp_engine const *
5600 Perl_current_re_engine(pTHX)
5602 if (IN_PERL_COMPILETIME) {
5603 HV * const table = GvHV(PL_hintgv);
5606 if (!table || !(PL_hints & HINT_LOCALIZE_HH))
5607 return &reh_regexp_engine;
5608 ptr = hv_fetchs(table, "regcomp", FALSE);
5609 if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
5610 return &reh_regexp_engine;
5611 return INT2PTR(regexp_engine*,SvIV(*ptr));
5615 if (!PL_curcop->cop_hints_hash)
5616 return &reh_regexp_engine;
5617 ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
5618 if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
5619 return &reh_regexp_engine;
5620 return INT2PTR(regexp_engine*,SvIV(ptr));
5626 Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
5628 regexp_engine const *eng = current_re_engine();
5629 GET_RE_DEBUG_FLAGS_DECL;
5631 PERL_ARGS_ASSERT_PREGCOMP;
5633 /* Dispatch a request to compile a regexp to correct regexp engine. */
5635 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
5638 return CALLREGCOMP_ENG(eng, pattern, flags);
5642 /* public(ish) entry point for the perl core's own regex compiling code.
5643 * It's actually a wrapper for Perl_re_op_compile that only takes an SV
5644 * pattern rather than a list of OPs, and uses the internal engine rather
5645 * than the current one */
5648 Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
5650 SV *pat = pattern; /* defeat constness! */
5651 PERL_ARGS_ASSERT_RE_COMPILE;
5652 return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
5653 #ifdef PERL_IN_XSUB_RE
5658 NULL, NULL, rx_flags, 0);
5662 /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code
5663 * blocks, recalculate the indices. Update pat_p and plen_p in-place to
5664 * point to the realloced string and length.
5666 * This is essentially a copy of Perl_bytes_to_utf8() with the code index
5670 S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
5671 char **pat_p, STRLEN *plen_p, int num_code_blocks)
5673 U8 *const src = (U8*)*pat_p;
5678 GET_RE_DEBUG_FLAGS_DECL;
5680 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
5681 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
5683 Newx(dst, *plen_p * 2 + 1, U8);
5686 while (s < *plen_p) {
5687 append_utf8_from_native_byte(src[s], &d);
5688 if (n < num_code_blocks) {
5689 if (!do_end && pRExC_state->code_blocks[n].start == s) {
5690 pRExC_state->code_blocks[n].start = d - dst - 1;
5691 assert(*(d - 1) == '(');
5694 else if (do_end && pRExC_state->code_blocks[n].end == s) {
5695 pRExC_state->code_blocks[n].end = d - dst - 1;
5696 assert(*(d - 1) == ')');
5705 *pat_p = (char*) dst;
5707 RExC_orig_utf8 = RExC_utf8 = 1;
5712 /* S_concat_pat(): concatenate a list of args to the pattern string pat,
5713 * while recording any code block indices, and handling overloading,
5714 * nested qr// objects etc. If pat is null, it will allocate a new
5715 * string, or just return the first arg, if there's only one.
5717 * Returns the malloced/updated pat.
5718 * patternp and pat_count is the array of SVs to be concatted;
5719 * oplist is the optional list of ops that generated the SVs;
5720 * recompile_p is a pointer to a boolean that will be set if
5721 * the regex will need to be recompiled.
5722 * delim, if non-null is an SV that will be inserted between each element
5726 S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
5727 SV *pat, SV ** const patternp, int pat_count,
5728 OP *oplist, bool *recompile_p, SV *delim)
5732 bool use_delim = FALSE;
5733 bool alloced = FALSE;
5735 /* if we know we have at least two args, create an empty string,
5736 * then concatenate args to that. For no args, return an empty string */
5737 if (!pat && pat_count != 1) {
5743 for (svp = patternp; svp < patternp + pat_count; svp++) {
5746 STRLEN orig_patlen = 0;
5748 SV *msv = use_delim ? delim : *svp;
5749 if (!msv) msv = &PL_sv_undef;
5751 /* if we've got a delimiter, we go round the loop twice for each
5752 * svp slot (except the last), using the delimiter the second
5761 if (SvTYPE(msv) == SVt_PVAV) {
5762 /* we've encountered an interpolated array within
5763 * the pattern, e.g. /...@a..../. Expand the list of elements,
5764 * then recursively append elements.
5765 * The code in this block is based on S_pushav() */
5767 AV *const av = (AV*)msv;
5768 const SSize_t maxarg = AvFILL(av) + 1;
5772 assert(oplist->op_type == OP_PADAV
5773 || oplist->op_type == OP_RV2AV);
5774 oplist = OP_SIBLING(oplist);
5777 if (SvRMAGICAL(av)) {
5780 Newx(array, maxarg, SV*);
5782 for (i=0; i < maxarg; i++) {
5783 SV ** const svp = av_fetch(av, i, FALSE);
5784 array[i] = svp ? *svp : &PL_sv_undef;
5788 array = AvARRAY(av);
5790 pat = S_concat_pat(aTHX_ pRExC_state, pat,
5791 array, maxarg, NULL, recompile_p,
5793 GvSV((gv_fetchpvs("\"", GV_ADDMULTI, SVt_PV))));
5799 /* we make the assumption here that each op in the list of
5800 * op_siblings maps to one SV pushed onto the stack,
5801 * except for code blocks, with have both an OP_NULL and
5803 * This allows us to match up the list of SVs against the
5804 * list of OPs to find the next code block.
5806 * Note that PUSHMARK PADSV PADSV ..
5808 * PADRANGE PADSV PADSV ..
5809 * so the alignment still works. */
5812 if (oplist->op_type == OP_NULL
5813 && (oplist->op_flags & OPf_SPECIAL))
5815 assert(n < pRExC_state->num_code_blocks);
5816 pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0;
5817 pRExC_state->code_blocks[n].block = oplist;
5818 pRExC_state->code_blocks[n].src_regex = NULL;
5821 oplist = OP_SIBLING(oplist); /* skip CONST */
5824 oplist = OP_SIBLING(oplist);;
5827 /* apply magic and QR overloading to arg */
5830 if (SvROK(msv) && SvAMAGIC(msv)) {
5831 SV *sv = AMG_CALLunary(msv, regexp_amg);
5835 if (SvTYPE(sv) != SVt_REGEXP)
5836 Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
5841 /* try concatenation overload ... */
5842 if (pat && (SvAMAGIC(pat) || SvAMAGIC(msv)) &&
5843 (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
5846 /* overloading involved: all bets are off over literal
5847 * code. Pretend we haven't seen it */
5848 pRExC_state->num_code_blocks -= n;
5852 /* ... or failing that, try "" overload */
5853 while (SvAMAGIC(msv)
5854 && (sv = AMG_CALLunary(msv, string_amg))
5858 && SvRV(msv) == SvRV(sv))
5863 if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
5867 /* this is a partially unrolled
5868 * sv_catsv_nomg(pat, msv);
5869 * that allows us to adjust code block indices if
5872 char *dst = SvPV_force_nomg(pat, dlen);
5874 if (SvUTF8(msv) && !SvUTF8(pat)) {
5875 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &dst, &dlen, n);
5876 sv_setpvn(pat, dst, dlen);
5879 sv_catsv_nomg(pat, msv);
5886 pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
5889 /* extract any code blocks within any embedded qr//'s */
5890 if (rx && SvTYPE(rx) == SVt_REGEXP
5891 && RX_ENGINE((REGEXP*)rx)->op_comp)
5894 RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
5895 if (ri->num_code_blocks) {
5897 /* the presence of an embedded qr// with code means
5898 * we should always recompile: the text of the
5899 * qr// may not have changed, but it may be a
5900 * different closure than last time */
5902 Renew(pRExC_state->code_blocks,
5903 pRExC_state->num_code_blocks + ri->num_code_blocks,
5904 struct reg_code_block);
5905 pRExC_state->num_code_blocks += ri->num_code_blocks;
5907 for (i=0; i < ri->num_code_blocks; i++) {
5908 struct reg_code_block *src, *dst;
5909 STRLEN offset = orig_patlen
5910 + ReANY((REGEXP *)rx)->pre_prefix;
5911 assert(n < pRExC_state->num_code_blocks);
5912 src = &ri->code_blocks[i];
5913 dst = &pRExC_state->code_blocks[n];
5914 dst->start = src->start + offset;
5915 dst->end = src->end + offset;
5916 dst->block = src->block;
5917 dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
5926 /* avoid calling magic multiple times on a single element e.g. =~ $qr */
5935 /* see if there are any run-time code blocks in the pattern.
5936 * False positives are allowed */
5939 S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5940 char *pat, STRLEN plen)
5945 PERL_UNUSED_CONTEXT;
5947 for (s = 0; s < plen; s++) {
5948 if (n < pRExC_state->num_code_blocks
5949 && s == pRExC_state->code_blocks[n].start)
5951 s = pRExC_state->code_blocks[n].end;
5955 /* TODO ideally should handle [..], (#..), /#.../x to reduce false
5957 if (pat[s] == '(' && s+2 <= plen && pat[s+1] == '?' &&
5959 || (s + 2 <= plen && pat[s+2] == '?' && pat[s+3] == '{'))
5966 /* Handle run-time code blocks. We will already have compiled any direct
5967 * or indirect literal code blocks. Now, take the pattern 'pat' and make a
5968 * copy of it, but with any literal code blocks blanked out and
5969 * appropriate chars escaped; then feed it into
5971 * eval "qr'modified_pattern'"
5975 * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
5979 * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno'
5981 * After eval_sv()-ing that, grab any new code blocks from the returned qr
5982 * and merge them with any code blocks of the original regexp.
5984 * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
5985 * instead, just save the qr and return FALSE; this tells our caller that
5986 * the original pattern needs upgrading to utf8.
5990 S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
5991 char *pat, STRLEN plen)
5995 GET_RE_DEBUG_FLAGS_DECL;
5997 if (pRExC_state->runtime_code_qr) {
5998 /* this is the second time we've been called; this should
5999 * only happen if the main pattern got upgraded to utf8
6000 * during compilation; re-use the qr we compiled first time
6001 * round (which should be utf8 too)
6003 qr = pRExC_state->runtime_code_qr;
6004 pRExC_state->runtime_code_qr = NULL;
6005 assert(RExC_utf8 && SvUTF8(qr));
6011 int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
6015 /* determine how many extra chars we need for ' and \ escaping */
6016 for (s = 0; s < plen; s++) {
6017 if (pat[s] == '\'' || pat[s] == '\\')
6021 Newx(newpat, newlen, char);
6023 *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
6025 for (s = 0; s < plen; s++) {
6026 if (n < pRExC_state->num_code_blocks
6027 && s == pRExC_state->code_blocks[n].start)
6029 /* blank out literal code block */
6030 assert(pat[s] == '(');
6031 while (s <= pRExC_state->code_blocks[n].end) {
6039 if (pat[s] == '\'' || pat[s] == '\\')
6044 if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
6048 PerlIO_printf(Perl_debug_log,
6049 "%sre-parsing pattern for runtime code:%s %s\n",
6050 PL_colors[4],PL_colors[5],newpat);
6053 sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
6058 PUSHSTACKi(PERLSI_REQUIRE);
6059 /* G_RE_REPARSING causes the toker to collapse \\ into \ when
6060 * parsing qr''; normally only q'' does this. It also alters
6062 eval_sv(sv, G_SCALAR|G_RE_REPARSING);
6063 SvREFCNT_dec_NN(sv);
6068 SV * const errsv = ERRSV;
6069 if (SvTRUE_NN(errsv))
6071 Safefree(pRExC_state->code_blocks);
6072 /* use croak_sv ? */
6073 Perl_croak_nocontext("%"SVf, SVfARG(errsv));
6076 assert(SvROK(qr_ref));
6078 assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
6079 /* the leaving below frees the tmp qr_ref.
6080 * Give qr a life of its own */
6088 if (!RExC_utf8 && SvUTF8(qr)) {
6089 /* first time through; the pattern got upgraded; save the
6090 * qr for the next time through */
6091 assert(!pRExC_state->runtime_code_qr);
6092 pRExC_state->runtime_code_qr = qr;
6097 /* extract any code blocks within the returned qr// */
6100 /* merge the main (r1) and run-time (r2) code blocks into one */
6102 RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
6103 struct reg_code_block *new_block, *dst;
6104 RExC_state_t * const r1 = pRExC_state; /* convenient alias */
6107 if (!r2->num_code_blocks) /* we guessed wrong */
6109 SvREFCNT_dec_NN(qr);
6114 r1->num_code_blocks + r2->num_code_blocks,
6115 struct reg_code_block);
6118 while ( i1 < r1->num_code_blocks
6119 || i2 < r2->num_code_blocks)
6121 struct reg_code_block *src;
6124 if (i1 == r1->num_code_blocks) {
6125 src = &r2->code_blocks[i2++];
6128 else if (i2 == r2->num_code_blocks)
6129 src = &r1->code_blocks[i1++];
6130 else if ( r1->code_blocks[i1].start
6131 < r2->code_blocks[i2].start)
6133 src = &r1->code_blocks[i1++];
6134 assert(src->end < r2->code_blocks[i2].start);
6137 assert( r1->code_blocks[i1].start
6138 > r2->code_blocks[i2].start);
6139 src = &r2->code_blocks[i2++];
6141 assert(src->end < r1->code_blocks[i1].start);
6144 assert(pat[src->start] == '(');
6145 assert(pat[src->end] == ')');
6146 dst->start = src->start;
6147 dst->end = src->end;
6148 dst->block = src->block;
6149 dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
6153 r1->num_code_blocks += r2->num_code_blocks;
6154 Safefree(r1->code_blocks);
6155 r1->code_blocks = new_block;
6158 SvREFCNT_dec_NN(qr);
6164 S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
6165 SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
6166 SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
6167 STRLEN longest_length, bool eol, bool meol)
6169 /* This is the common code for setting up the floating and fixed length
6170 * string data extracted from Perl_re_op_compile() below. Returns a boolean
6171 * as to whether succeeded or not */
6176 if (! (longest_length
6177 || (eol /* Can't have SEOL and MULTI */
6178 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
6180 /* See comments for join_exact for why REG_UNFOLDED_MULTI_SEEN */
6181 || (RExC_seen & REG_UNFOLDED_MULTI_SEEN))
6186 /* copy the information about the longest from the reg_scan_data
6187 over to the program. */
6188 if (SvUTF8(sv_longest)) {
6189 *rx_utf8 = sv_longest;
6192 *rx_substr = sv_longest;
6195 /* end_shift is how many chars that must be matched that
6196 follow this item. We calculate it ahead of time as once the
6197 lookbehind offset is added in we lose the ability to correctly
6199 ml = minlen ? *(minlen) : (SSize_t)longest_length;
6200 *rx_end_shift = ml - offset
6201 - longest_length + (SvTAIL(sv_longest) != 0)
6204 t = (eol/* Can't have SEOL and MULTI */
6205 && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
6206 fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
6212 * Perl_re_op_compile - the perl internal RE engine's function to compile a
6213 * regular expression into internal code.
6214 * The pattern may be passed either as:
6215 * a list of SVs (patternp plus pat_count)
6216 * a list of OPs (expr)
6217 * If both are passed, the SV list is used, but the OP list indicates
6218 * which SVs are actually pre-compiled code blocks
6220 * The SVs in the list have magic and qr overloading applied to them (and
6221 * the list may be modified in-place with replacement SVs in the latter
6224 * If the pattern hasn't changed from old_re, then old_re will be
6227 * eng is the current engine. If that engine has an op_comp method, then
6228 * handle directly (i.e. we assume that op_comp was us); otherwise, just
6229 * do the initial concatenation of arguments and pass on to the external
6232 * If is_bare_re is not null, set it to a boolean indicating whether the
6233 * arg list reduced (after overloading) to a single bare regex which has
6234 * been returned (i.e. /$qr/).
6236 * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
6238 * pm_flags contains the PMf_* flags, typically based on those from the
6239 * pm_flags field of the related PMOP. Currently we're only interested in
6240 * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
6242 * We can't allocate space until we know how big the compiled form will be,
6243 * but we can't compile it (and thus know how big it is) until we've got a
6244 * place to put the code. So we cheat: we compile it twice, once with code
6245 * generation turned off and size counting turned on, and once "for real".
6246 * This also means that we don't allocate space until we are sure that the
6247 * thing really will compile successfully, and we never have to move the
6248 * code and thus invalidate pointers into it. (Note that it has to be in
6249 * one piece because free() must be able to free it all.) [NB: not true in perl]
6251 * Beware that the optimization-preparation code in here knows about some
6252 * of the structure of the compiled regexp. [I'll say.]
6256 Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
6257 OP *expr, const regexp_engine* eng, REGEXP *old_re,
6258 bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
6262 regexp_internal *ri;
6270 SV *code_blocksv = NULL;
6271 SV** new_patternp = patternp;
6273 /* these are all flags - maybe they should be turned
6274 * into a single int with different bit masks */
6275 I32 sawlookahead = 0;
6280 regex_charset initial_charset = get_regex_charset(orig_rx_flags);
6282 bool runtime_code = 0;
6284 RExC_state_t RExC_state;
6285 RExC_state_t * const pRExC_state = &RExC_state;
6286 #ifdef TRIE_STUDY_OPT
6288 RExC_state_t copyRExC_state;
6290 GET_RE_DEBUG_FLAGS_DECL;
6292 PERL_ARGS_ASSERT_RE_OP_COMPILE;
6294 DEBUG_r(if (!PL_colorset) reginitcolors());
6296 #ifndef PERL_IN_XSUB_RE
6297 /* Initialize these here instead of as-needed, as is quick and avoids
6298 * having to test them each time otherwise */
6299 if (! PL_AboveLatin1) {
6300 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
6301 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
6302 PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist);
6303 PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist);
6304 PL_HasMultiCharFold =
6305 _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist);
6307 /* This is calculated here, because the Perl program that generates the
6308 * static global ones doesn't currently have access to
6309 * NUM_ANYOF_CODE_POINTS */
6310 PL_InBitmap = _new_invlist(2);
6311 PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0,
6312 NUM_ANYOF_CODE_POINTS - 1);
6316 pRExC_state->code_blocks = NULL;
6317 pRExC_state->num_code_blocks = 0;
6320 *is_bare_re = FALSE;
6322 if (expr && (expr->op_type == OP_LIST ||
6323 (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
6324 /* allocate code_blocks if needed */
6328 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
6329 if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
6330 ncode++; /* count of DO blocks */
6332 pRExC_state->num_code_blocks = ncode;
6333 Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
6338 /* compile-time pattern with just OP_CONSTs and DO blocks */
6343 /* find how many CONSTs there are */
6346 if (expr->op_type == OP_CONST)
6349 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6350 if (o->op_type == OP_CONST)
6354 /* fake up an SV array */
6356 assert(!new_patternp);
6357 Newx(new_patternp, n, SV*);
6358 SAVEFREEPV(new_patternp);
6362 if (expr->op_type == OP_CONST)
6363 new_patternp[n] = cSVOPx_sv(expr);
6365 for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
6366 if (o->op_type == OP_CONST)
6367 new_patternp[n++] = cSVOPo_sv;
6372 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6373 "Assembling pattern from %d elements%s\n", pat_count,
6374 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6376 /* set expr to the first arg op */
6378 if (pRExC_state->num_code_blocks
6379 && expr->op_type != OP_CONST)
6381 expr = cLISTOPx(expr)->op_first;
6382 assert( expr->op_type == OP_PUSHMARK
6383 || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
6384 || expr->op_type == OP_PADRANGE);
6385 expr = OP_SIBLING(expr);
6388 pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
6389 expr, &recompile, NULL);
6391 /* handle bare (possibly after overloading) regex: foo =~ $re */
6396 if (SvTYPE(re) == SVt_REGEXP) {
6400 Safefree(pRExC_state->code_blocks);
6401 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
6402 "Precompiled pattern%s\n",
6403 orig_rx_flags & RXf_SPLIT ? " for split" : ""));
6409 exp = SvPV_nomg(pat, plen);
6411 if (!eng->op_comp) {
6412 if ((SvUTF8(pat) && IN_BYTES)
6413 || SvGMAGICAL(pat) || SvAMAGIC(pat))
6415 /* make a temporary copy; either to convert to bytes,
6416 * or to avoid repeating get-magic / overloaded stringify */
6417 pat = newSVpvn_flags(exp, plen, SVs_TEMP |
6418 (IN_BYTES ? 0 : SvUTF8(pat)));
6420 Safefree(pRExC_state->code_blocks);
6421 return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
6424 /* ignore the utf8ness if the pattern is 0 length */
6425 RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
6426 RExC_uni_semantics = 0;
6427 RExC_contains_locale = 0;
6428 RExC_contains_i = 0;
6429 pRExC_state->runtime_code_qr = NULL;
6432 SV *dsv= sv_newmortal();
6433 RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
6434 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
6435 PL_colors[4],PL_colors[5],s);
6439 /* we jump here if we upgrade the pattern to utf8 and have to
6442 if ((pm_flags & PMf_USE_RE_EVAL)
6443 /* this second condition covers the non-regex literal case,
6444 * i.e. $foo =~ '(?{})'. */
6445 || (IN_PERL_COMPILETIME && (PL_hints & HINT_RE_EVAL))
6447 runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
6449 /* return old regex if pattern hasn't changed */
6450 /* XXX: note in the below we have to check the flags as well as the
6453 * Things get a touch tricky as we have to compare the utf8 flag
6454 * independently from the compile flags. */
6458 && !!RX_UTF8(old_re) == !!RExC_utf8
6459 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
6460 && RX_PRECOMP(old_re)
6461 && RX_PRELEN(old_re) == plen
6462 && memEQ(RX_PRECOMP(old_re), exp, plen)
6463 && !runtime_code /* with runtime code, always recompile */ )
6465 Safefree(pRExC_state->code_blocks);
6469 rx_flags = orig_rx_flags;
6471 if (rx_flags & PMf_FOLD) {
6472 RExC_contains_i = 1;
6474 if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
6476 /* Set to use unicode semantics if the pattern is in utf8 and has the
6477 * 'depends' charset specified, as it means unicode when utf8 */
6478 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6482 RExC_flags = rx_flags;
6483 RExC_pm_flags = pm_flags;
6486 if (TAINTING_get && TAINT_get)
6487 Perl_croak(aTHX_ "Eval-group in insecure regular expression");
6489 if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
6490 /* whoops, we have a non-utf8 pattern, whilst run-time code
6491 * got compiled as utf8. Try again with a utf8 pattern */
6492 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6493 pRExC_state->num_code_blocks);
6494 goto redo_first_pass;
6497 assert(!pRExC_state->runtime_code_qr);
6503 RExC_in_lookbehind = 0;
6504 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
6506 RExC_override_recoding = 0;
6507 RExC_in_multi_char_class = 0;
6509 /* First pass: determine size, legality. */
6512 RExC_end = exp + plen;
6517 RExC_emit = (regnode *) &RExC_emit_dummy;
6518 RExC_whilem_seen = 0;
6519 RExC_open_parens = NULL;
6520 RExC_close_parens = NULL;
6522 RExC_paren_names = NULL;
6524 RExC_paren_name_list = NULL;
6526 RExC_recurse = NULL;
6527 RExC_study_chunk_recursed = NULL;
6528 RExC_study_chunk_recursed_bytes= 0;
6529 RExC_recurse_count = 0;
6530 pRExC_state->code_index = 0;
6532 #if 0 /* REGC() is (currently) a NOP at the first pass.
6533 * Clever compilers notice this and complain. --jhi */
6534 REGC((U8)REG_MAGIC, (char*)RExC_emit);
6537 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
6539 RExC_lastparse=NULL;
6541 /* reg may croak on us, not giving us a chance to free
6542 pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
6543 need it to survive as long as the regexp (qr/(?{})/).
6544 We must check that code_blocksv is not already set, because we may
6545 have jumped back to restart the sizing pass. */
6546 if (pRExC_state->code_blocks && !code_blocksv) {
6547 code_blocksv = newSV_type(SVt_PV);
6548 SAVEFREESV(code_blocksv);
6549 SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
6550 SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
6552 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6553 /* It's possible to write a regexp in ascii that represents Unicode
6554 codepoints outside of the byte range, such as via \x{100}. If we
6555 detect such a sequence we have to convert the entire pattern to utf8
6556 and then recompile, as our sizing calculation will have been based
6557 on 1 byte == 1 character, but we will need to use utf8 to encode
6558 at least some part of the pattern, and therefore must convert the whole
6561 if (flags & RESTART_UTF8) {
6562 S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
6563 pRExC_state->num_code_blocks);
6564 goto redo_first_pass;
6566 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
6569 SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
6572 PerlIO_printf(Perl_debug_log,
6573 "Required size %"IVdf" nodes\n"
6574 "Starting second pass (creation)\n",
6577 RExC_lastparse=NULL;
6580 /* The first pass could have found things that force Unicode semantics */
6581 if ((RExC_utf8 || RExC_uni_semantics)
6582 && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
6584 set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
6587 /* Small enough for pointer-storage convention?
6588 If extralen==0, this means that we will not need long jumps. */
6589 if (RExC_size >= 0x10000L && RExC_extralen)
6590 RExC_size += RExC_extralen;
6593 if (RExC_whilem_seen > 15)
6594 RExC_whilem_seen = 15;
6596 /* Allocate space and zero-initialize. Note, the two step process
6597 of zeroing when in debug mode, thus anything assigned has to
6598 happen after that */
6599 rx = (REGEXP*) newSV_type(SVt_REGEXP);
6601 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6602 char, regexp_internal);
6603 if ( r == NULL || ri == NULL )
6604 FAIL("Regexp out of space");
6606 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
6607 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
6610 /* bulk initialize base fields with 0. */
6611 Zero(ri, sizeof(regexp_internal), char);
6614 /* non-zero initialization begins here */
6617 r->extflags = rx_flags;
6618 RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
6620 if (pm_flags & PMf_IS_QR) {
6621 ri->code_blocks = pRExC_state->code_blocks;
6622 ri->num_code_blocks = pRExC_state->num_code_blocks;
6627 for (n = 0; n < pRExC_state->num_code_blocks; n++)
6628 if (pRExC_state->code_blocks[n].src_regex)
6629 SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
6630 SAVEFREEPV(pRExC_state->code_blocks);
6634 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
6635 bool has_charset = (get_regex_charset(r->extflags)
6636 != REGEX_DEPENDS_CHARSET);
6638 /* The caret is output if there are any defaults: if not all the STD
6639 * flags are set, or if no character set specifier is needed */
6641 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
6643 bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
6644 == REG_RUN_ON_COMMENT_SEEN);
6645 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
6646 >> RXf_PMf_STD_PMMOD_SHIFT);
6647 const char *fptr = STD_PAT_MODS; /*"msix"*/
6649 /* Allocate for the worst case, which is all the std flags are turned
6650 * on. If more precision is desired, we could do a population count of
6651 * the flags set. This could be done with a small lookup table, or by
6652 * shifting, masking and adding, or even, when available, assembly
6653 * language for a machine-language population count.
6654 * We never output a minus, as all those are defaults, so are
6655 * covered by the caret */
6656 const STRLEN wraplen = plen + has_p + has_runon
6657 + has_default /* If needs a caret */
6659 /* If needs a character set specifier */
6660 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
6661 + (sizeof(STD_PAT_MODS) - 1)
6662 + (sizeof("(?:)") - 1);
6664 Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
6665 r->xpv_len_u.xpvlenu_pv = p;
6667 SvFLAGS(rx) |= SVf_UTF8;
6670 /* If a default, cover it using the caret */
6672 *p++= DEFAULT_PAT_MOD;
6676 const char* const name = get_regex_charset_name(r->extflags, &len);
6677 Copy(name, p, len, char);
6681 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
6684 while((ch = *fptr++)) {
6692 Copy(RExC_precomp, p, plen, char);
6693 assert ((RX_WRAPPED(rx) - p) < 16);
6694 r->pre_prefix = p - RX_WRAPPED(rx);
6700 SvCUR_set(rx, p - RX_WRAPPED(rx));
6704 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
6706 /* setup various meta data about recursion, this all requires
6707 * RExC_npar to be correctly set, and a bit later on we clear it */
6708 if (RExC_seen & REG_RECURSE_SEEN) {
6709 Newxz(RExC_open_parens, RExC_npar,regnode *);
6710 SAVEFREEPV(RExC_open_parens);
6711 Newxz(RExC_close_parens,RExC_npar,regnode *);
6712 SAVEFREEPV(RExC_close_parens);
6714 if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) {
6715 /* Note, RExC_npar is 1 + the number of parens in a pattern.
6716 * So its 1 if there are no parens. */
6717 RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
6718 ((RExC_npar & 0x07) != 0);
6719 Newx(RExC_study_chunk_recursed,
6720 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6721 SAVEFREEPV(RExC_study_chunk_recursed);
6724 /* Useful during FAIL. */
6725 #ifdef RE_TRACK_PATTERN_OFFSETS
6726 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
6727 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
6728 "%s %"UVuf" bytes for offset annotations.\n",
6729 ri->u.offsets ? "Got" : "Couldn't get",
6730 (UV)((2*RExC_size+1) * sizeof(U32))));
6732 SetProgLen(ri,RExC_size);
6736 REH_CALL_COMP_BEGIN_HOOK(pRExC_state->rx);
6738 /* Second pass: emit code. */
6739 RExC_flags = rx_flags; /* don't let top level (?i) bleed */
6740 RExC_pm_flags = pm_flags;
6742 RExC_end = exp + plen;
6745 RExC_emit_start = ri->program;
6746 RExC_emit = ri->program;
6747 RExC_emit_bound = ri->program + RExC_size + 1;
6748 pRExC_state->code_index = 0;
6750 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
6751 if (reg(pRExC_state, 0, &flags,1) == NULL) {
6753 Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
6755 /* XXXX To minimize changes to RE engine we always allocate
6756 3-units-long substrs field. */
6757 Newx(r->substrs, 1, struct reg_substr_data);
6758 if (RExC_recurse_count) {
6759 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
6760 SAVEFREEPV(RExC_recurse);
6764 r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
6765 Zero(r->substrs, 1, struct reg_substr_data);
6766 if (RExC_study_chunk_recursed)
6767 Zero(RExC_study_chunk_recursed,
6768 RExC_study_chunk_recursed_bytes * RExC_npar, U8);
6770 #ifdef TRIE_STUDY_OPT
6772 StructCopy(&zero_scan_data, &data, scan_data_t);
6773 copyRExC_state = RExC_state;
6776 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
6778 RExC_state = copyRExC_state;
6779 if (seen & REG_TOP_LEVEL_BRANCHES_SEEN)
6780 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
6782 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES_SEEN;
6783 StructCopy(&zero_scan_data, &data, scan_data_t);
6786 StructCopy(&zero_scan_data, &data, scan_data_t);
6789 /* Dig out information for optimizations. */
6790 r->extflags = RExC_flags; /* was pm_op */
6791 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
6794 SvUTF8_on(rx); /* Unicode in it? */
6795 ri->regstclass = NULL;
6796 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
6797 r->intflags |= PREGf_NAUGHTY;
6798 scan = ri->program + 1; /* First BRANCH. */
6800 /* testing for BRANCH here tells us whether there is "must appear"
6801 data in the pattern. If there is then we can use it for optimisations */
6802 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
6805 STRLEN longest_float_length, longest_fixed_length;
6806 regnode_ssc ch_class; /* pointed to by data */
6808 SSize_t last_close = 0; /* pointed to by data */
6809 regnode *first= scan;
6810 regnode *first_next= regnext(first);
6812 * Skip introductions and multiplicators >= 1
6813 * so that we can extract the 'meat' of the pattern that must
6814 * match in the large if() sequence following.
6815 * NOTE that EXACT is NOT covered here, as it is normally
6816 * picked up by the optimiser separately.
6818 * This is unfortunate as the optimiser isnt handling lookahead
6819 * properly currently.
6822 while ((OP(first) == OPEN && (sawopen = 1)) ||
6823 /* An OR of *one* alternative - should not happen now. */
6824 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
6825 /* for now we can't handle lookbehind IFMATCH*/
6826 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
6827 (OP(first) == PLUS) ||
6828 (OP(first) == MINMOD) ||
6829 /* An {n,m} with n>0 */
6830 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
6831 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
6834 * the only op that could be a regnode is PLUS, all the rest
6835 * will be regnode_1 or regnode_2.
6837 * (yves doesn't think this is true)
6839 if (OP(first) == PLUS)
6842 if (OP(first) == MINMOD)
6844 first += regarglen[OP(first)];
6846 first = NEXTOPER(first);
6847 first_next= regnext(first);
6850 /* Starting-point info. */
6852 DEBUG_PEEP("first:",first,0);
6853 /* Ignore EXACT as we deal with it later. */
6854 if (PL_regkind[OP(first)] == EXACT) {
6855 if (OP(first) == EXACT)
6856 NOOP; /* Empty, get anchored substr later. */
6858 ri->regstclass = first;
6861 else if (PL_regkind[OP(first)] == TRIE &&
6862 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
6864 /* this can happen only on restudy */
6865 ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
6868 else if (REGNODE_SIMPLE(OP(first)))
6869 ri->regstclass = first;
6870 else if (PL_regkind[OP(first)] == BOUND ||
6871 PL_regkind[OP(first)] == NBOUND)
6872 ri->regstclass = first;
6873 else if (PL_regkind[OP(first)] == BOL) {
6874 r->intflags |= (OP(first) == MBOL
6877 first = NEXTOPER(first);
6880 else if (OP(first) == GPOS) {
6881 r->intflags |= PREGf_ANCH_GPOS;
6882 first = NEXTOPER(first);
6885 else if ((!sawopen || !RExC_sawback) &&
6887 (OP(first) == STAR &&
6888 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
6889 !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
6891 /* turn .* into ^.* with an implied $*=1 */
6893 (OP(NEXTOPER(first)) == REG_ANY)
6896 r->intflags |= (type | PREGf_IMPLICIT);
6897 first = NEXTOPER(first);
6900 if (sawplus && !sawminmod && !sawlookahead
6901 && (!sawopen || !RExC_sawback)
6902 && !pRExC_state->num_code_blocks) /* May examine pos and $& */
6903 /* x+ must match at the 1st pos of run of x's */
6904 r->intflags |= PREGf_SKIP;
6906 /* Scan is after the zeroth branch, first is atomic matcher. */
6907 #ifdef TRIE_STUDY_OPT
6910 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6911 (IV)(first - scan + 1))
6915 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
6916 (IV)(first - scan + 1))
6922 * If there's something expensive in the r.e., find the
6923 * longest literal string that must appear and make it the
6924 * regmust. Resolve ties in favor of later strings, since
6925 * the regstart check works with the beginning of the r.e.
6926 * and avoiding duplication strengthens checking. Not a
6927 * strong reason, but sufficient in the absence of others.
6928 * [Now we resolve ties in favor of the earlier string if
6929 * it happens that c_offset_min has been invalidated, since the
6930 * earlier string may buy us something the later one won't.]
6933 data.longest_fixed = newSVpvs("");
6934 data.longest_float = newSVpvs("");
6935 data.last_found = newSVpvs("");
6936 data.longest = &(data.longest_fixed);
6937 ENTER_with_name("study_chunk");
6938 SAVEFREESV(data.longest_fixed);
6939 SAVEFREESV(data.longest_float);
6940 SAVEFREESV(data.last_found);
6942 if (!ri->regstclass) {
6943 ssc_init(pRExC_state, &ch_class);
6944 data.start_class = &ch_class;
6945 stclass_flag = SCF_DO_STCLASS_AND;
6946 } else /* XXXX Check for BOUND? */
6948 data.last_closep = &last_close;
6951 minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
6952 scan + RExC_size, /* Up to end */
6954 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag
6955 | (restudied ? SCF_TRIE_DOING_RESTUDY : 0),
6959 CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
6962 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
6963 && data.last_start_min == 0 && data.last_end > 0
6964 && !RExC_seen_zerolen
6965 && !(RExC_seen & REG_VERBARG_SEEN)
6966 && !(RExC_seen & REG_GPOS_SEEN)
6968 r->extflags |= RXf_CHECK_ALL;
6970 scan_commit(pRExC_state, &data,&minlen,0);
6972 longest_float_length = CHR_SVLEN(data.longest_float);
6974 if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
6975 && data.offset_fixed == data.offset_float_min
6976 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
6977 && S_setup_longest (aTHX_ pRExC_state,
6981 &(r->float_end_shift),
6982 data.lookbehind_float,
6983 data.offset_float_min,
6985 longest_float_length,
6986 cBOOL(data.flags & SF_FL_BEFORE_EOL),
6987 cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
6989 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
6990 r->float_max_offset = data.offset_float_max;
6991 if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
6992 r->float_max_offset -= data.lookbehind_float;
6993 SvREFCNT_inc_simple_void_NN(data.longest_float);
6996 r->float_substr = r->float_utf8 = NULL;
6997 longest_float_length = 0;
7000 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
7002 if (S_setup_longest (aTHX_ pRExC_state,
7004 &(r->anchored_utf8),
7005 &(r->anchored_substr),
7006 &(r->anchored_end_shift),
7007 data.lookbehind_fixed,
7010 longest_fixed_length,
7011 cBOOL(data.flags & SF_FIX_BEFORE_EOL),
7012 cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
7014 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
7015 SvREFCNT_inc_simple_void_NN(data.longest_fixed);
7018 r->anchored_substr = r->anchored_utf8 = NULL;
7019 longest_fixed_length = 0;
7021 LEAVE_with_name("study_chunk");
7024 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
7025 ri->regstclass = NULL;
7027 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
7029 && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7030 && !ssc_is_anything(data.start_class))
7032 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7034 ssc_finalize(pRExC_state, data.start_class);
7036 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7037 StructCopy(data.start_class,
7038 (regnode_ssc*)RExC_rxi->data->data[n],
7040 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7041 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7042 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
7043 regprop(r, sv, (regnode*)data.start_class, NULL);
7044 PerlIO_printf(Perl_debug_log,
7045 "synthetic stclass \"%s\".\n",
7046 SvPVX_const(sv));});
7047 data.start_class = NULL;
7050 /* A temporary algorithm prefers floated substr to fixed one to dig
7052 if (longest_fixed_length > longest_float_length) {
7053 r->substrs->check_ix = 0;
7054 r->check_end_shift = r->anchored_end_shift;
7055 r->check_substr = r->anchored_substr;
7056 r->check_utf8 = r->anchored_utf8;
7057 r->check_offset_min = r->check_offset_max = r->anchored_offset;
7058 if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
7059 r->intflags |= PREGf_NOSCAN;
7062 r->substrs->check_ix = 1;
7063 r->check_end_shift = r->float_end_shift;
7064 r->check_substr = r->float_substr;
7065 r->check_utf8 = r->float_utf8;
7066 r->check_offset_min = r->float_min_offset;
7067 r->check_offset_max = r->float_max_offset;
7069 if ((r->check_substr || r->check_utf8) ) {
7070 r->extflags |= RXf_USE_INTUIT;
7071 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
7072 r->extflags |= RXf_INTUIT_TAIL;
7074 r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
7076 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
7077 if ( (STRLEN)minlen < longest_float_length )
7078 minlen= longest_float_length;
7079 if ( (STRLEN)minlen < longest_fixed_length )
7080 minlen= longest_fixed_length;
7084 /* Several toplevels. Best we can is to set minlen. */
7086 regnode_ssc ch_class;
7087 SSize_t last_close = 0;
7089 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
7091 scan = ri->program + 1;
7092 ssc_init(pRExC_state, &ch_class);
7093 data.start_class = &ch_class;
7094 data.last_closep = &last_close;
7097 minlen = study_chunk(pRExC_state,
7098 &scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
7099 SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
7100 ? SCF_TRIE_DOING_RESTUDY
7104 CHECK_RESTUDY_GOTO_butfirst(NOOP);
7106 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
7107 = r->float_substr = r->float_utf8 = NULL;
7109 if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
7110 && ! ssc_is_anything(data.start_class))
7112 const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
7114 ssc_finalize(pRExC_state, data.start_class);
7116 Newx(RExC_rxi->data->data[n], 1, regnode_ssc);
7117 StructCopy(data.start_class,
7118 (regnode_ssc*)RExC_rxi->data->data[n],
7120 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
7121 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
7122 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
7123 regprop(r, sv, (regnode*)data.start_class, NULL);
7124 PerlIO_printf(Perl_debug_log,
7125 "synthetic stclass \"%s\".\n",
7126 SvPVX_const(sv));});
7127 data.start_class = NULL;
7131 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
7132 r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
7133 r->maxlen = REG_INFTY;
7136 r->maxlen = RExC_maxlen;
7139 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
7140 the "real" pattern. */
7142 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n",
7143 (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
7145 r->minlenret = minlen;
7146 if (r->minlen < minlen)
7149 if (RExC_seen & REG_GPOS_SEEN)
7150 r->intflags |= PREGf_GPOS_SEEN;
7151 if (RExC_seen & REG_LOOKBEHIND_SEEN)
7152 r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
7154 if (pRExC_state->num_code_blocks)
7155 r->extflags |= RXf_EVAL_SEEN;
7156 if (RExC_seen & REG_CANY_SEEN)
7157 r->intflags |= PREGf_CANY_SEEN;
7158 if (RExC_seen & REG_VERBARG_SEEN)
7160 r->intflags |= PREGf_VERBARG_SEEN;
7161 r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
7163 if (RExC_seen & REG_CUTGROUP_SEEN)
7164 r->intflags |= PREGf_CUTGROUP_SEEN;
7165 if (pm_flags & PMf_USE_RE_EVAL)
7166 r->intflags |= PREGf_USE_RE_EVAL;
7167 if (RExC_paren_names)
7168 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
7170 RXp_PAREN_NAMES(r) = NULL;
7172 /* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
7173 * so it can be used in pp.c */
7174 if (r->intflags & PREGf_ANCH)
7175 r->extflags |= RXf_IS_ANCHORED;
7179 /* this is used to identify "special" patterns that might result
7180 * in Perl NOT calling the regex engine and instead doing the match "itself",
7181 * particularly special cases in split//. By having the regex compiler
7182 * do this pattern matching at a regop level (instead of by inspecting the pattern)
7183 * we avoid weird issues with equivalent patterns resulting in different behavior,
7184 * AND we allow non Perl engines to get the same optimizations by the setting the
7185 * flags appropriately - Yves */
7186 regnode *first = ri->program + 1;
7188 regnode *next = NEXTOPER(first);
7191 if (PL_regkind[fop] == NOTHING && nop == END)
7192 r->extflags |= RXf_NULL;
7193 else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
7194 /* when fop is SBOL first->flags will be true only when it was
7195 * produced by parsing /\A/, and not when parsing /^/. This is
7196 * very important for the split code as there we want to
7197 * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
7198 * See rt #122761 for more details. -- Yves */
7199 r->extflags |= RXf_START_ONLY;
7200 else if (fop == PLUS
7201 && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
7202 && OP(regnext(first)) == END)
7203 r->extflags |= RXf_WHITE;
7204 else if ( r->extflags & RXf_SPLIT
7206 && STR_LEN(first) == 1
7207 && *(STRING(first)) == ' '
7208 && OP(regnext(first)) == END )
7209 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
7213 if (RExC_contains_locale) {
7214 RXp_EXTFLAGS(r) |= RXf_TAINTED;
7218 if (RExC_paren_names) {
7219 ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
7220 ri->data->data[ri->name_list_idx]
7221 = (void*)SvREFCNT_inc(RExC_paren_name_list);
7224 ri->name_list_idx = 0;
7226 if (RExC_recurse_count) {
7227 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
7228 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
7229 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
7232 Newxz(r->offs, RExC_npar, regexp_paren_pair);
7233 /* assume we don't need to swap parens around before we match */
7237 PerlIO_printf(Perl_debug_log,"Final program:\n");
7240 #ifdef RE_TRACK_PATTERN_OFFSETS
7241 DEBUG_OFFSETS_r(if (ri->u.offsets) {
7242 const STRLEN len = ri->u.offsets[0];
7244 GET_RE_DEBUG_FLAGS_DECL;
7245 PerlIO_printf(Perl_debug_log,
7246 "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
7247 for (i = 1; i <= len; i++) {
7248 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
7249 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7250 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
7252 PerlIO_printf(Perl_debug_log, "\n");
7257 /* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
7258 * by setting the regexp SV to readonly-only instead. If the
7259 * pattern's been recompiled, the USEDness should remain. */
7260 if (old_re && SvREADONLY(old_re))
7268 Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
7271 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
7273 PERL_UNUSED_ARG(value);
7275 if (flags & RXapif_FETCH) {
7276 return reg_named_buff_fetch(rx, key, flags);
7277 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
7278 Perl_croak_no_modify();
7280 } else if (flags & RXapif_EXISTS) {
7281 return reg_named_buff_exists(rx, key, flags)
7284 } else if (flags & RXapif_REGNAMES) {
7285 return reg_named_buff_all(rx, flags);
7286 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
7287 return reg_named_buff_scalar(rx, flags);
7289 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
7295 Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
7298 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
7299 PERL_UNUSED_ARG(lastkey);
7301 if (flags & RXapif_FIRSTKEY)
7302 return reg_named_buff_firstkey(rx, flags);
7303 else if (flags & RXapif_NEXTKEY)
7304 return reg_named_buff_nextkey(rx, flags);
7306 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter",
7313 Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
7316 AV *retarray = NULL;
7318 struct regexp *const rx = ReANY(r);
7320 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
7322 if (flags & RXapif_ALL)
7325 if (rx && RXp_PAREN_NAMES(rx)) {
7326 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
7329 SV* sv_dat=HeVAL(he_str);
7330 I32 *nums=(I32*)SvPVX(sv_dat);
7331 for ( i=0; i<SvIVX(sv_dat); i++ ) {
7332 if ((I32)(rx->nparens) >= nums[i]
7333 && rx->offs[nums[i]].start != -1
7334 && rx->offs[nums[i]].end != -1)
7337 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
7342 ret = newSVsv(&PL_sv_undef);
7345 av_push(retarray, ret);
7348 return newRV_noinc(MUTABLE_SV(retarray));
7355 Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
7358 struct regexp *const rx = ReANY(r);
7360 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
7362 if (rx && RXp_PAREN_NAMES(rx)) {
7363 if (flags & RXapif_ALL) {
7364 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
7366 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
7368 SvREFCNT_dec_NN(sv);
7380 Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
7382 struct regexp *const rx = ReANY(r);
7384 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
7386 if ( rx && RXp_PAREN_NAMES(rx) ) {
7387 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
7389 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
7396 Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
7398 struct regexp *const rx = ReANY(r);
7399 GET_RE_DEBUG_FLAGS_DECL;
7401 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
7403 if (rx && RXp_PAREN_NAMES(rx)) {
7404 HV *hv = RXp_PAREN_NAMES(rx);
7406 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7409 SV* sv_dat = HeVAL(temphe);
7410 I32 *nums = (I32*)SvPVX(sv_dat);
7411 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7412 if ((I32)(rx->lastparen) >= nums[i] &&
7413 rx->offs[nums[i]].start != -1 &&
7414 rx->offs[nums[i]].end != -1)
7420 if (parno || flags & RXapif_ALL) {
7421 return newSVhek(HeKEY_hek(temphe));
7429 Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
7434 struct regexp *const rx = ReANY(r);
7436 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
7438 if (rx && RXp_PAREN_NAMES(rx)) {
7439 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
7440 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
7441 } else if (flags & RXapif_ONE) {
7442 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
7443 av = MUTABLE_AV(SvRV(ret));
7444 length = av_tindex(av);
7445 SvREFCNT_dec_NN(ret);
7446 return newSViv(length + 1);
7448 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar",
7453 return &PL_sv_undef;
7457 Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
7459 struct regexp *const rx = ReANY(r);
7462 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
7464 if (rx && RXp_PAREN_NAMES(rx)) {
7465 HV *hv= RXp_PAREN_NAMES(rx);
7467 (void)hv_iterinit(hv);
7468 while ( (temphe = hv_iternext_flags(hv,0)) ) {
7471 SV* sv_dat = HeVAL(temphe);
7472 I32 *nums = (I32*)SvPVX(sv_dat);
7473 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
7474 if ((I32)(rx->lastparen) >= nums[i] &&
7475 rx->offs[nums[i]].start != -1 &&
7476 rx->offs[nums[i]].end != -1)
7482 if (parno || flags & RXapif_ALL) {
7483 av_push(av, newSVhek(HeKEY_hek(temphe)));
7488 return newRV_noinc(MUTABLE_SV(av));
7492 Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
7495 struct regexp *const rx = ReANY(r);
7501 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
7503 if ( n == RX_BUFF_IDX_CARET_PREMATCH
7504 || n == RX_BUFF_IDX_CARET_FULLMATCH
7505 || n == RX_BUFF_IDX_CARET_POSTMATCH
7508 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7510 /* on something like
7513 * the KEEPCOPY is set on the PMOP rather than the regex */
7514 if (PL_curpm && r == PM_GETRE(PL_curpm))
7515 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7524 if (n == RX_BUFF_IDX_CARET_FULLMATCH)
7525 /* no need to distinguish between them any more */
7526 n = RX_BUFF_IDX_FULLMATCH;
7528 if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
7529 && rx->offs[0].start != -1)
7531 /* $`, ${^PREMATCH} */
7532 i = rx->offs[0].start;
7536 if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
7537 && rx->offs[0].end != -1)
7539 /* $', ${^POSTMATCH} */
7540 s = rx->subbeg - rx->suboffset + rx->offs[0].end;
7541 i = rx->sublen + rx->suboffset - rx->offs[0].end;
7544 if ( 0 <= n && n <= (I32)rx->nparens &&
7545 (s1 = rx->offs[n].start) != -1 &&
7546 (t1 = rx->offs[n].end) != -1)
7548 /* $&, ${^MATCH}, $1 ... */
7550 s = rx->subbeg + s1 - rx->suboffset;
7555 assert(s >= rx->subbeg);
7556 assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
7558 #ifdef NO_TAINT_SUPPORT
7559 sv_setpvn(sv, s, i);
7561 const int oldtainted = TAINT_get;
7563 sv_setpvn(sv, s, i);
7564 TAINT_set(oldtainted);
7566 if ( (rx->intflags & PREGf_CANY_SEEN)
7567 ? (RXp_MATCH_UTF8(rx)
7568 && (!i || is_utf8_string((U8*)s, i)))
7569 : (RXp_MATCH_UTF8(rx)) )
7576 if (RXp_MATCH_TAINTED(rx)) {
7577 if (SvTYPE(sv) >= SVt_PVMG) {
7578 MAGIC* const mg = SvMAGIC(sv);
7581 SvMAGIC_set(sv, mg->mg_moremagic);
7583 if ((mgt = SvMAGIC(sv))) {
7584 mg->mg_moremagic = mgt;
7585 SvMAGIC_set(sv, mg);
7596 sv_setsv(sv,&PL_sv_undef);
7602 Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
7603 SV const * const value)
7605 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
7607 PERL_UNUSED_ARG(rx);
7608 PERL_UNUSED_ARG(paren);
7609 PERL_UNUSED_ARG(value);
7612 Perl_croak_no_modify();
7616 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
7619 struct regexp *const rx = ReANY(r);
7623 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
7625 if ( paren == RX_BUFF_IDX_CARET_PREMATCH
7626 || paren == RX_BUFF_IDX_CARET_FULLMATCH
7627 || paren == RX_BUFF_IDX_CARET_POSTMATCH
7630 bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
7632 /* on something like
7635 * the KEEPCOPY is set on the PMOP rather than the regex */
7636 if (PL_curpm && r == PM_GETRE(PL_curpm))
7637 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
7643 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
7645 case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
7646 case RX_BUFF_IDX_PREMATCH: /* $` */
7647 if (rx->offs[0].start != -1) {
7648 i = rx->offs[0].start;
7657 case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
7658 case RX_BUFF_IDX_POSTMATCH: /* $' */
7659 if (rx->offs[0].end != -1) {
7660 i = rx->sublen - rx->offs[0].end;
7662 s1 = rx->offs[0].end;
7669 default: /* $& / ${^MATCH}, $1, $2, ... */
7670 if (paren <= (I32)rx->nparens &&
7671 (s1 = rx->offs[paren].start) != -1 &&
7672 (t1 = rx->offs[paren].end) != -1)
7678 if (ckWARN(WARN_UNINITIALIZED))
7679 report_uninit((const SV *)sv);
7684 if (i > 0 && RXp_MATCH_UTF8(rx)) {
7685 const char * const s = rx->subbeg - rx->suboffset + s1;
7690 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
7697 Perl_reg_qr_package(pTHX_ REGEXP * const rx)
7699 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
7700 PERL_UNUSED_ARG(rx);
7704 return newSVpvs("Regexp");
7707 /* Scans the name of a named buffer from the pattern.
7708 * If flags is REG_RSN_RETURN_NULL returns null.
7709 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
7710 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
7711 * to the parsed name as looked up in the RExC_paren_names hash.
7712 * If there is an error throws a vFAIL().. type exception.
7715 #define REG_RSN_RETURN_NULL 0
7716 #define REG_RSN_RETURN_NAME 1
7717 #define REG_RSN_RETURN_DATA 2
7720 S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
7722 char *name_start = RExC_parse;
7724 PERL_ARGS_ASSERT_REG_SCAN_NAME;
7726 assert (RExC_parse <= RExC_end);
7727 if (RExC_parse == RExC_end) NOOP;
7728 else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
7729 /* skip IDFIRST by using do...while */
7732 RExC_parse += UTF8SKIP(RExC_parse);
7733 } while (isWORDCHAR_utf8((U8*)RExC_parse));
7737 } while (isWORDCHAR(*RExC_parse));
7739 RExC_parse++; /* so the <- from the vFAIL is after the offending
7741 vFAIL("Group name must start with a non-digit word character");
7745 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
7746 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
7747 if ( flags == REG_RSN_RETURN_NAME)
7749 else if (flags==REG_RSN_RETURN_DATA) {
7752 if ( ! sv_name ) /* should not happen*/
7753 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
7754 if (RExC_paren_names)
7755 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
7757 sv_dat = HeVAL(he_str);
7759 vFAIL("Reference to nonexistent named group");
7763 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
7764 (unsigned long) flags);
7766 assert(0); /* NOT REACHED */
7771 #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
7772 int rem=(int)(RExC_end - RExC_parse); \
7781 if (RExC_lastparse!=RExC_parse) \
7782 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
7785 iscut ? "..." : "<" \
7788 PerlIO_printf(Perl_debug_log,"%16s",""); \
7791 num = RExC_size + 1; \
7793 num=REG_NODE_NUM(RExC_emit); \
7794 if (RExC_lastnum!=num) \
7795 PerlIO_printf(Perl_debug_log,"|%4d",num); \
7797 PerlIO_printf(Perl_debug_log,"|%4s",""); \
7798 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
7799 (int)((depth*2)), "", \
7803 RExC_lastparse=RExC_parse; \
7808 #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
7809 DEBUG_PARSE_MSG((funcname)); \
7810 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
7812 #define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
7813 DEBUG_PARSE_MSG((funcname)); \
7814 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
7817 /* This section of code defines the inversion list object and its methods. The
7818 * interfaces are highly subject to change, so as much as possible is static to
7819 * this file. An inversion list is here implemented as a malloc'd C UV array
7820 * as an SVt_INVLIST scalar.
7822 * An inversion list for Unicode is an array of code points, sorted by ordinal
7823 * number. The zeroth element is the first code point in the list. The 1th
7824 * element is the first element beyond that not in the list. In other words,
7825 * the first range is
7826 * invlist[0]..(invlist[1]-1)
7827 * The other ranges follow. Thus every element whose index is divisible by two
7828 * marks the beginning of a range that is in the list, and every element not
7829 * divisible by two marks the beginning of a range not in the list. A single
7830 * element inversion list that contains the single code point N generally
7831 * consists of two elements
7834 * (The exception is when N is the highest representable value on the
7835 * machine, in which case the list containing just it would be a single
7836 * element, itself. By extension, if the last range in the list extends to
7837 * infinity, then the first element of that range will be in the inversion list
7838 * at a position that is divisible by two, and is the final element in the
7840 * Taking the complement (inverting) an inversion list is quite simple, if the
7841 * first element is 0, remove it; otherwise add a 0 element at the beginning.
7842 * This implementation reserves an element at the beginning of each inversion
7843 * list to always contain 0; there is an additional flag in the header which
7844 * indicates if the list begins at the 0, or is offset to begin at the next
7847 * More about inversion lists can be found in "Unicode Demystified"
7848 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
7849 * More will be coming when functionality is added later.
7851 * The inversion list data structure is currently implemented as an SV pointing
7852 * to an array of UVs that the SV thinks are bytes. This allows us to have an
7853 * array of UV whose memory management is automatically handled by the existing
7854 * facilities for SV's.
7856 * Some of the methods should always be private to the implementation, and some
7857 * should eventually be made public */
7859 /* The header definitions are in F<inline_invlist.c> */
7861 PERL_STATIC_INLINE UV*
7862 S__invlist_array_init(SV* const invlist, const bool will_have_0)
7864 /* Returns a pointer to the first element in the inversion list's array.
7865 * This is called upon initialization of an inversion list. Where the
7866 * array begins depends on whether the list has the code point U+0000 in it
7867 * or not. The other parameter tells it whether the code that follows this
7868 * call is about to put a 0 in the inversion list or not. The first
7869 * element is either the element reserved for 0, if TRUE, or the element
7870 * after it, if FALSE */
7872 bool* offset = get_invlist_offset_addr(invlist);
7873 UV* zero_addr = (UV *) SvPVX(invlist);
7875 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
7878 assert(! _invlist_len(invlist));
7882 /* 1^1 = 0; 1^0 = 1 */
7883 *offset = 1 ^ will_have_0;
7884 return zero_addr + *offset;
7887 PERL_STATIC_INLINE UV*
7888 S_invlist_array(SV* const invlist)
7890 /* Returns the pointer to the inversion list's array. Every time the
7891 * length changes, this needs to be called in case malloc or realloc moved
7894 PERL_ARGS_ASSERT_INVLIST_ARRAY;
7896 /* Must not be empty. If these fail, you probably didn't check for <len>
7897 * being non-zero before trying to get the array */
7898 assert(_invlist_len(invlist));
7900 /* The very first element always contains zero, The array begins either
7901 * there, or if the inversion list is offset, at the element after it.
7902 * The offset header field determines which; it contains 0 or 1 to indicate
7903 * how much additionally to add */
7904 assert(0 == *(SvPVX(invlist)));
7905 return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist));
7908 PERL_STATIC_INLINE void
7909 S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset)
7911 /* Sets the current number of elements stored in the inversion list.
7912 * Updates SvCUR correspondingly */
7913 PERL_UNUSED_CONTEXT;
7914 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
7916 assert(SvTYPE(invlist) == SVt_INVLIST);
7921 : TO_INTERNAL_SIZE(len + offset));
7922 assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
7925 PERL_STATIC_INLINE IV*
7926 S_get_invlist_previous_index_addr(SV* invlist)
7928 /* Return the address of the IV that is reserved to hold the cached index
7930 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
7932 assert(SvTYPE(invlist) == SVt_INVLIST);
7934 return &(((XINVLIST*) SvANY(invlist))->prev_index);
7937 PERL_STATIC_INLINE IV
7938 S_invlist_previous_index(SV* const invlist)
7940 /* Returns cached index of previous search */
7942 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
7944 return *get_invlist_previous_index_addr(invlist);
7947 PERL_STATIC_INLINE void
7948 S_invlist_set_previous_index(SV* const invlist, const IV index)
7950 /* Caches <index> for later retrieval */
7952 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
7954 assert(index == 0 || index < (int) _invlist_len(invlist));
7956 *get_invlist_previous_index_addr(invlist) = index;
7959 PERL_STATIC_INLINE UV
7960 S_invlist_max(SV* const invlist)
7962 /* Returns the maximum number of elements storable in the inversion list's
7963 * array, without having to realloc() */
7965 PERL_ARGS_ASSERT_INVLIST_MAX;
7967 assert(SvTYPE(invlist) == SVt_INVLIST);
7969 /* Assumes worst case, in which the 0 element is not counted in the
7970 * inversion list, so subtracts 1 for that */
7971 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
7972 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
7973 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
7976 #ifndef PERL_IN_XSUB_RE
7978 Perl__new_invlist(pTHX_ IV initial_size)
7981 /* Return a pointer to a newly constructed inversion list, with enough
7982 * space to store 'initial_size' elements. If that number is negative, a
7983 * system default is used instead */
7987 if (initial_size < 0) {
7991 /* Allocate the initial space */
7992 new_list = newSV_type(SVt_INVLIST);
7994 /* First 1 is in case the zero element isn't in the list; second 1 is for
7996 SvGROW(new_list, TO_INTERNAL_SIZE(initial_size + 1) + 1);
7997 invlist_set_len(new_list, 0, 0);
7999 /* Force iterinit() to be used to get iteration to work */
8000 *get_invlist_iter_addr(new_list) = (STRLEN) UV_MAX;
8002 *get_invlist_previous_index_addr(new_list) = 0;
8008 Perl__new_invlist_C_array(pTHX_ const UV* const list)
8010 /* Return a pointer to a newly constructed inversion list, initialized to
8011 * point to <list>, which has to be in the exact correct inversion list
8012 * form, including internal fields. Thus this is a dangerous routine that
8013 * should not be used in the wrong hands. The passed in 'list' contains
8014 * several header fields at the beginning that are not part of the
8015 * inversion list body proper */
8017 const STRLEN length = (STRLEN) list[0];
8018 const UV version_id = list[1];
8019 const bool offset = cBOOL(list[2]);
8020 #define HEADER_LENGTH 3
8021 /* If any of the above changes in any way, you must change HEADER_LENGTH
8022 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
8023 * perl -E 'say int(rand 2**31-1)'
8025 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
8026 data structure type, so that one being
8027 passed in can be validated to be an
8028 inversion list of the correct vintage.
8031 SV* invlist = newSV_type(SVt_INVLIST);
8033 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
8035 if (version_id != INVLIST_VERSION_ID) {
8036 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
8039 /* The generated array passed in includes header elements that aren't part
8040 * of the list proper, so start it just after them */
8041 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
8043 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
8044 shouldn't touch it */
8046 *(get_invlist_offset_addr(invlist)) = offset;
8048 /* The 'length' passed to us is the physical number of elements in the
8049 * inversion list. But if there is an offset the logical number is one
8051 invlist_set_len(invlist, length - offset, offset);
8053 invlist_set_previous_index(invlist, 0);
8055 /* Initialize the iteration pointer. */
8056 invlist_iterfinish(invlist);
8058 SvREADONLY_on(invlist);
8062 #endif /* ifndef PERL_IN_XSUB_RE */
8065 S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
8067 /* Grow the maximum size of an inversion list */
8069 PERL_ARGS_ASSERT_INVLIST_EXTEND;
8071 assert(SvTYPE(invlist) == SVt_INVLIST);
8073 /* Add one to account for the zero element at the beginning which may not
8074 * be counted by the calling parameters */
8075 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
8078 PERL_STATIC_INLINE void
8079 S_invlist_trim(SV* const invlist)
8081 PERL_ARGS_ASSERT_INVLIST_TRIM;
8083 assert(SvTYPE(invlist) == SVt_INVLIST);
8085 /* Change the length of the inversion list to how many entries it currently
8087 SvPV_shrink_to_cur((SV *) invlist);
8091 S__append_range_to_invlist(pTHX_ SV* const invlist,
8092 const UV start, const UV end)
8094 /* Subject to change or removal. Append the range from 'start' to 'end' at
8095 * the end of the inversion list. The range must be above any existing
8099 UV max = invlist_max(invlist);
8100 UV len = _invlist_len(invlist);
8103 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
8105 if (len == 0) { /* Empty lists must be initialized */
8106 offset = start != 0;
8107 array = _invlist_array_init(invlist, ! offset);
8110 /* Here, the existing list is non-empty. The current max entry in the
8111 * list is generally the first value not in the set, except when the
8112 * set extends to the end of permissible values, in which case it is
8113 * the first entry in that final set, and so this call is an attempt to
8114 * append out-of-order */
8116 UV final_element = len - 1;
8117 array = invlist_array(invlist);
8118 if (array[final_element] > start
8119 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
8121 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
8122 array[final_element], start,
8123 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
8126 /* Here, it is a legal append. If the new range begins with the first
8127 * value not in the set, it is extending the set, so the new first
8128 * value not in the set is one greater than the newly extended range.
8130 offset = *get_invlist_offset_addr(invlist);
8131 if (array[final_element] == start) {
8132 if (end != UV_MAX) {
8133 array[final_element] = end + 1;
8136 /* But if the end is the maximum representable on the machine,
8137 * just let the range that this would extend to have no end */
8138 invlist_set_len(invlist, len - 1, offset);
8144 /* Here the new range doesn't extend any existing set. Add it */
8146 len += 2; /* Includes an element each for the start and end of range */
8148 /* If wll overflow the existing space, extend, which may cause the array to
8151 invlist_extend(invlist, len);
8153 /* Have to set len here to avoid assert failure in invlist_array() */
8154 invlist_set_len(invlist, len, offset);
8156 array = invlist_array(invlist);
8159 invlist_set_len(invlist, len, offset);
8162 /* The next item on the list starts the range, the one after that is
8163 * one past the new range. */
8164 array[len - 2] = start;
8165 if (end != UV_MAX) {
8166 array[len - 1] = end + 1;
8169 /* But if the end is the maximum representable on the machine, just let
8170 * the range have no end */
8171 invlist_set_len(invlist, len - 1, offset);
8175 #ifndef PERL_IN_XSUB_RE
8178 Perl__invlist_search(SV* const invlist, const UV cp)
8180 /* Searches the inversion list for the entry that contains the input code
8181 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
8182 * return value is the index into the list's array of the range that
8187 IV high = _invlist_len(invlist);
8188 const IV highest_element = high - 1;
8191 PERL_ARGS_ASSERT__INVLIST_SEARCH;
8193 /* If list is empty, return failure. */
8198 /* (We can't get the array unless we know the list is non-empty) */
8199 array = invlist_array(invlist);
8201 mid = invlist_previous_index(invlist);
8202 assert(mid >=0 && mid <= highest_element);
8204 /* <mid> contains the cache of the result of the previous call to this
8205 * function (0 the first time). See if this call is for the same result,
8206 * or if it is for mid-1. This is under the theory that calls to this
8207 * function will often be for related code points that are near each other.
8208 * And benchmarks show that caching gives better results. We also test
8209 * here if the code point is within the bounds of the list. These tests
8210 * replace others that would have had to be made anyway to make sure that
8211 * the array bounds were not exceeded, and these give us extra information
8212 * at the same time */
8213 if (cp >= array[mid]) {
8214 if (cp >= array[highest_element]) {
8215 return highest_element;
8218 /* Here, array[mid] <= cp < array[highest_element]. This means that
8219 * the final element is not the answer, so can exclude it; it also
8220 * means that <mid> is not the final element, so can refer to 'mid + 1'
8222 if (cp < array[mid + 1]) {
8228 else { /* cp < aray[mid] */
8229 if (cp < array[0]) { /* Fail if outside the array */
8233 if (cp >= array[mid - 1]) {
8238 /* Binary search. What we are looking for is <i> such that
8239 * array[i] <= cp < array[i+1]
8240 * The loop below converges on the i+1. Note that there may not be an
8241 * (i+1)th element in the array, and things work nonetheless */
8242 while (low < high) {
8243 mid = (low + high) / 2;
8244 assert(mid <= highest_element);
8245 if (array[mid] <= cp) { /* cp >= array[mid] */
8248 /* We could do this extra test to exit the loop early.
8249 if (cp < array[low]) {
8254 else { /* cp < array[mid] */
8261 invlist_set_previous_index(invlist, high);
8266 Perl__invlist_populate_swatch(SV* const invlist,
8267 const UV start, const UV end, U8* swatch)
8269 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
8270 * but is used when the swash has an inversion list. This makes this much
8271 * faster, as it uses a binary search instead of a linear one. This is
8272 * intimately tied to that function, and perhaps should be in utf8.c,
8273 * except it is intimately tied to inversion lists as well. It assumes
8274 * that <swatch> is all 0's on input */
8277 const IV len = _invlist_len(invlist);
8281 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
8283 if (len == 0) { /* Empty inversion list */
8287 array = invlist_array(invlist);
8289 /* Find which element it is */
8290 i = _invlist_search(invlist, start);
8292 /* We populate from <start> to <end> */
8293 while (current < end) {
8296 /* The inversion list gives the results for every possible code point
8297 * after the first one in the list. Only those ranges whose index is
8298 * even are ones that the inversion list matches. For the odd ones,
8299 * and if the initial code point is not in the list, we have to skip
8300 * forward to the next element */
8301 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
8303 if (i >= len) { /* Finished if beyond the end of the array */
8307 if (current >= end) { /* Finished if beyond the end of what we
8309 if (LIKELY(end < UV_MAX)) {
8313 /* We get here when the upper bound is the maximum
8314 * representable on the machine, and we are looking for just
8315 * that code point. Have to special case it */
8317 goto join_end_of_list;
8320 assert(current >= start);
8322 /* The current range ends one below the next one, except don't go past
8325 upper = (i < len && array[i] < end) ? array[i] : end;
8327 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
8328 * for each code point in it */
8329 for (; current < upper; current++) {
8330 const STRLEN offset = (STRLEN)(current - start);
8331 swatch[offset >> 3] |= 1 << (offset & 7);
8336 /* Quit if at the end of the list */
8339 /* But first, have to deal with the highest possible code point on
8340 * the platform. The previous code assumes that <end> is one
8341 * beyond where we want to populate, but that is impossible at the
8342 * platform's infinity, so have to handle it specially */
8343 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
8345 const STRLEN offset = (STRLEN)(end - start);
8346 swatch[offset >> 3] |= 1 << (offset & 7);
8351 /* Advance to the next range, which will be for code points not in the
8360 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8361 const bool complement_b, SV** output)
8363 /* Take the union of two inversion lists and point <output> to it. *output
8364 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8365 * the reference count to that list will be decremented if not already a
8366 * temporary (mortal); otherwise *output will be made correspondingly
8367 * mortal. The first list, <a>, may be NULL, in which case a copy of the
8368 * second list is returned. If <complement_b> is TRUE, the union is taken
8369 * of the complement (inversion) of <b> instead of b itself.
8371 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8372 * Richard Gillam, published by Addison-Wesley, and explained at some
8373 * length there. The preface says to incorporate its examples into your
8374 * code at your own risk.
8376 * The algorithm is like a merge sort.
8378 * XXX A potential performance improvement is to keep track as we go along
8379 * if only one of the inputs contributes to the result, meaning the other
8380 * is a subset of that one. In that case, we can skip the final copy and
8381 * return the larger of the input lists, but then outside code might need
8382 * to keep track of whether to free the input list or not */
8384 const UV* array_a; /* a's array */
8386 UV len_a; /* length of a's array */
8389 SV* u; /* the resulting union */
8393 UV i_a = 0; /* current index into a's array */
8397 /* running count, as explained in the algorithm source book; items are
8398 * stopped accumulating and are output when the count changes to/from 0.
8399 * The count is incremented when we start a range that's in the set, and
8400 * decremented when we start a range that's not in the set. So its range
8401 * is 0 to 2. Only when the count is zero is something not in the set.
8405 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
8408 /* If either one is empty, the union is the other one */
8409 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
8410 bool make_temp = FALSE; /* Should we mortalize the result? */
8414 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8420 *output = invlist_clone(b);
8422 _invlist_invert(*output);
8424 } /* else *output already = b; */
8427 sv_2mortal(*output);
8431 else if ((len_b = _invlist_len(b)) == 0) {
8432 bool make_temp = FALSE;
8434 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8439 /* The complement of an empty list is a list that has everything in it,
8440 * so the union with <a> includes everything too */
8443 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8447 *output = _new_invlist(1);
8448 _append_range_to_invlist(*output, 0, UV_MAX);
8450 else if (*output != a) {
8451 *output = invlist_clone(a);
8453 /* else *output already = a; */
8456 sv_2mortal(*output);
8461 /* Here both lists exist and are non-empty */
8462 array_a = invlist_array(a);
8463 array_b = invlist_array(b);
8465 /* If are to take the union of 'a' with the complement of b, set it
8466 * up so are looking at b's complement. */
8469 /* To complement, we invert: if the first element is 0, remove it. To
8470 * do this, we just pretend the array starts one later */
8471 if (array_b[0] == 0) {
8477 /* But if the first element is not zero, we pretend the list starts
8478 * at the 0 that is always stored immediately before the array. */
8484 /* Size the union for the worst case: that the sets are completely
8486 u = _new_invlist(len_a + len_b);
8488 /* Will contain U+0000 if either component does */
8489 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
8490 || (len_b > 0 && array_b[0] == 0));
8492 /* Go through each list item by item, stopping when exhausted one of
8494 while (i_a < len_a && i_b < len_b) {
8495 UV cp; /* The element to potentially add to the union's array */
8496 bool cp_in_set; /* is it in the the input list's set or not */
8498 /* We need to take one or the other of the two inputs for the union.
8499 * Since we are merging two sorted lists, we take the smaller of the
8500 * next items. In case of a tie, we take the one that is in its set
8501 * first. If we took one not in the set first, it would decrement the
8502 * count, possibly to 0 which would cause it to be output as ending the
8503 * range, and the next time through we would take the same number, and
8504 * output it again as beginning the next range. By doing it the
8505 * opposite way, there is no possibility that the count will be
8506 * momentarily decremented to 0, and thus the two adjoining ranges will
8507 * be seamlessly merged. (In a tie and both are in the set or both not
8508 * in the set, it doesn't matter which we take first.) */
8509 if (array_a[i_a] < array_b[i_b]
8510 || (array_a[i_a] == array_b[i_b]
8511 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8513 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8517 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8518 cp = array_b[i_b++];
8521 /* Here, have chosen which of the two inputs to look at. Only output
8522 * if the running count changes to/from 0, which marks the
8523 * beginning/end of a range in that's in the set */
8526 array_u[i_u++] = cp;
8533 array_u[i_u++] = cp;
8538 /* Here, we are finished going through at least one of the lists, which
8539 * means there is something remaining in at most one. We check if the list
8540 * that hasn't been exhausted is positioned such that we are in the middle
8541 * of a range in its set or not. (i_a and i_b point to the element beyond
8542 * the one we care about.) If in the set, we decrement 'count'; if 0, there
8543 * is potentially more to output.
8544 * There are four cases:
8545 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
8546 * in the union is entirely from the non-exhausted set.
8547 * 2) Both were in their sets, count is 2. Nothing further should
8548 * be output, as everything that remains will be in the exhausted
8549 * list's set, hence in the union; decrementing to 1 but not 0 insures
8551 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
8552 * Nothing further should be output because the union includes
8553 * everything from the exhausted set. Not decrementing ensures that.
8554 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
8555 * decrementing to 0 insures that we look at the remainder of the
8556 * non-exhausted set */
8557 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8558 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8563 /* The final length is what we've output so far, plus what else is about to
8564 * be output. (If 'count' is non-zero, then the input list we exhausted
8565 * has everything remaining up to the machine's limit in its set, and hence
8566 * in the union, so there will be no further output. */
8569 /* At most one of the subexpressions will be non-zero */
8570 len_u += (len_a - i_a) + (len_b - i_b);
8573 /* Set result to final length, which can change the pointer to array_u, so
8575 if (len_u != _invlist_len(u)) {
8576 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
8578 array_u = invlist_array(u);
8581 /* When 'count' is 0, the list that was exhausted (if one was shorter than
8582 * the other) ended with everything above it not in its set. That means
8583 * that the remaining part of the union is precisely the same as the
8584 * non-exhausted list, so can just copy it unchanged. (If both list were
8585 * exhausted at the same time, then the operations below will be both 0.)
8588 IV copy_count; /* At most one will have a non-zero copy count */
8589 if ((copy_count = len_a - i_a) > 0) {
8590 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
8592 else if ((copy_count = len_b - i_b) > 0) {
8593 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
8597 /* We may be removing a reference to one of the inputs. If so, the output
8598 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8599 * count decremented) */
8600 if (a == *output || b == *output) {
8601 assert(! invlist_is_iterating(*output));
8602 if ((SvTEMP(*output))) {
8606 SvREFCNT_dec_NN(*output);
8616 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
8617 const bool complement_b, SV** i)
8619 /* Take the intersection of two inversion lists and point <i> to it. *i
8620 * SHOULD BE DEFINED upon input, and if it points to one of the two lists,
8621 * the reference count to that list will be decremented if not already a
8622 * temporary (mortal); otherwise *i will be made correspondingly mortal.
8623 * The first list, <a>, may be NULL, in which case an empty list is
8624 * returned. If <complement_b> is TRUE, the result will be the
8625 * intersection of <a> and the complement (or inversion) of <b> instead of
8628 * The basis for this comes from "Unicode Demystified" Chapter 13 by
8629 * Richard Gillam, published by Addison-Wesley, and explained at some
8630 * length there. The preface says to incorporate its examples into your
8631 * code at your own risk. In fact, it had bugs
8633 * The algorithm is like a merge sort, and is essentially the same as the
8637 const UV* array_a; /* a's array */
8639 UV len_a; /* length of a's array */
8642 SV* r; /* the resulting intersection */
8646 UV i_a = 0; /* current index into a's array */
8650 /* running count, as explained in the algorithm source book; items are
8651 * stopped accumulating and are output when the count changes to/from 2.
8652 * The count is incremented when we start a range that's in the set, and
8653 * decremented when we start a range that's not in the set. So its range
8654 * is 0 to 2. Only when the count is 2 is something in the intersection.
8658 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
8661 /* Special case if either one is empty */
8662 len_a = (a == NULL) ? 0 : _invlist_len(a);
8663 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
8664 bool make_temp = FALSE;
8666 if (len_a != 0 && complement_b) {
8668 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
8669 * be empty. Here, also we are using 'b's complement, which hence
8670 * must be every possible code point. Thus the intersection is
8674 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8679 *i = invlist_clone(a);
8681 /* else *i is already 'a' */
8689 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
8690 * intersection must be empty */
8692 if (! (make_temp = cBOOL(SvTEMP(a)))) {
8697 if (! (make_temp = cBOOL(SvTEMP(b)))) {
8701 *i = _new_invlist(0);
8709 /* Here both lists exist and are non-empty */
8710 array_a = invlist_array(a);
8711 array_b = invlist_array(b);
8713 /* If are to take the intersection of 'a' with the complement of b, set it
8714 * up so are looking at b's complement. */
8717 /* To complement, we invert: if the first element is 0, remove it. To
8718 * do this, we just pretend the array starts one later */
8719 if (array_b[0] == 0) {
8725 /* But if the first element is not zero, we pretend the list starts
8726 * at the 0 that is always stored immediately before the array. */
8732 /* Size the intersection for the worst case: that the intersection ends up
8733 * fragmenting everything to be completely disjoint */
8734 r= _new_invlist(len_a + len_b);
8736 /* Will contain U+0000 iff both components do */
8737 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
8738 && len_b > 0 && array_b[0] == 0);
8740 /* Go through each list item by item, stopping when exhausted one of
8742 while (i_a < len_a && i_b < len_b) {
8743 UV cp; /* The element to potentially add to the intersection's
8745 bool cp_in_set; /* Is it in the input list's set or not */
8747 /* We need to take one or the other of the two inputs for the
8748 * intersection. Since we are merging two sorted lists, we take the
8749 * smaller of the next items. In case of a tie, we take the one that
8750 * is not in its set first (a difference from the union algorithm). If
8751 * we took one in the set first, it would increment the count, possibly
8752 * to 2 which would cause it to be output as starting a range in the
8753 * intersection, and the next time through we would take that same
8754 * number, and output it again as ending the set. By doing it the
8755 * opposite of this, there is no possibility that the count will be
8756 * momentarily incremented to 2. (In a tie and both are in the set or
8757 * both not in the set, it doesn't matter which we take first.) */
8758 if (array_a[i_a] < array_b[i_b]
8759 || (array_a[i_a] == array_b[i_b]
8760 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
8762 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
8766 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
8770 /* Here, have chosen which of the two inputs to look at. Only output
8771 * if the running count changes to/from 2, which marks the
8772 * beginning/end of a range that's in the intersection */
8776 array_r[i_r++] = cp;
8781 array_r[i_r++] = cp;
8787 /* Here, we are finished going through at least one of the lists, which
8788 * means there is something remaining in at most one. We check if the list
8789 * that has been exhausted is positioned such that we are in the middle
8790 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
8791 * the ones we care about.) There are four cases:
8792 * 1) Both weren't in their sets, count is 0, and remains 0. There's
8793 * nothing left in the intersection.
8794 * 2) Both were in their sets, count is 2 and perhaps is incremented to
8795 * above 2. What should be output is exactly that which is in the
8796 * non-exhausted set, as everything it has is also in the intersection
8797 * set, and everything it doesn't have can't be in the intersection
8798 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
8799 * gets incremented to 2. Like the previous case, the intersection is
8800 * everything that remains in the non-exhausted set.
8801 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
8802 * remains 1. And the intersection has nothing more. */
8803 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
8804 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
8809 /* The final length is what we've output so far plus what else is in the
8810 * intersection. At most one of the subexpressions below will be non-zero
8814 len_r += (len_a - i_a) + (len_b - i_b);
8817 /* Set result to final length, which can change the pointer to array_r, so
8819 if (len_r != _invlist_len(r)) {
8820 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
8822 array_r = invlist_array(r);
8825 /* Finish outputting any remaining */
8826 if (count >= 2) { /* At most one will have a non-zero copy count */
8828 if ((copy_count = len_a - i_a) > 0) {
8829 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
8831 else if ((copy_count = len_b - i_b) > 0) {
8832 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
8836 /* We may be removing a reference to one of the inputs. If so, the output
8837 * is made mortal if the input was. (Mortal SVs shouldn't have their ref
8838 * count decremented) */
8839 if (a == *i || b == *i) {
8840 assert(! invlist_is_iterating(*i));
8845 SvREFCNT_dec_NN(*i);
8855 Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
8857 /* Add the range from 'start' to 'end' inclusive to the inversion list's
8858 * set. A pointer to the inversion list is returned. This may actually be
8859 * a new list, in which case the passed in one has been destroyed. The
8860 * passed in inversion list can be NULL, in which case a new one is created
8861 * with just the one range in it */
8866 if (invlist == NULL) {
8867 invlist = _new_invlist(2);
8871 len = _invlist_len(invlist);
8874 /* If comes after the final entry actually in the list, can just append it
8877 || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1)
8878 && start >= invlist_array(invlist)[len - 1]))
8880 _append_range_to_invlist(invlist, start, end);
8884 /* Here, can't just append things, create and return a new inversion list
8885 * which is the union of this range and the existing inversion list */
8886 range_invlist = _new_invlist(2);
8887 _append_range_to_invlist(range_invlist, start, end);
8889 _invlist_union(invlist, range_invlist, &invlist);
8891 /* The temporary can be freed */
8892 SvREFCNT_dec_NN(range_invlist);
8898 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
8899 UV** other_elements_ptr)
8901 /* Create and return an inversion list whose contents are to be populated
8902 * by the caller. The caller gives the number of elements (in 'size') and
8903 * the very first element ('element0'). This function will set
8904 * '*other_elements_ptr' to an array of UVs, where the remaining elements
8907 * Obviously there is some trust involved that the caller will properly
8908 * fill in the other elements of the array.
8910 * (The first element needs to be passed in, as the underlying code does
8911 * things differently depending on whether it is zero or non-zero) */
8913 SV* invlist = _new_invlist(size);
8916 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
8918 _append_range_to_invlist(invlist, element0, element0);
8919 offset = *get_invlist_offset_addr(invlist);
8921 invlist_set_len(invlist, size, offset);
8922 *other_elements_ptr = invlist_array(invlist) + 1;
8928 PERL_STATIC_INLINE SV*
8929 S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
8930 return _add_range_to_invlist(invlist, cp, cp);
8933 #ifndef PERL_IN_XSUB_RE
8935 Perl__invlist_invert(pTHX_ SV* const invlist)
8937 /* Complement the input inversion list. This adds a 0 if the list didn't
8938 * have a zero; removes it otherwise. As described above, the data
8939 * structure is set up so that this is very efficient */
8941 PERL_ARGS_ASSERT__INVLIST_INVERT;
8943 assert(! invlist_is_iterating(invlist));
8945 /* The inverse of matching nothing is matching everything */
8946 if (_invlist_len(invlist) == 0) {
8947 _append_range_to_invlist(invlist, 0, UV_MAX);
8951 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
8956 PERL_STATIC_INLINE SV*
8957 S_invlist_clone(pTHX_ SV* const invlist)
8960 /* Return a new inversion list that is a copy of the input one, which is
8961 * unchanged. The new list will not be mortal even if the old one was. */
8963 /* Need to allocate extra space to accommodate Perl's addition of a
8964 * trailing NUL to SvPV's, since it thinks they are always strings */
8965 SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
8966 STRLEN physical_length = SvCUR(invlist);
8967 bool offset = *(get_invlist_offset_addr(invlist));
8969 PERL_ARGS_ASSERT_INVLIST_CLONE;
8971 *(get_invlist_offset_addr(new_invlist)) = offset;
8972 invlist_set_len(new_invlist, _invlist_len(invlist), offset);
8973 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
8978 PERL_STATIC_INLINE STRLEN*
8979 S_get_invlist_iter_addr(SV* invlist)
8981 /* Return the address of the UV that contains the current iteration
8984 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
8986 assert(SvTYPE(invlist) == SVt_INVLIST);
8988 return &(((XINVLIST*) SvANY(invlist))->iterator);
8991 PERL_STATIC_INLINE void
8992 S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
8994 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
8996 *get_invlist_iter_addr(invlist) = 0;
8999 PERL_STATIC_INLINE void
9000 S_invlist_iterfinish(SV* invlist)
9002 /* Terminate iterator for invlist. This is to catch development errors.
9003 * Any iteration that is interrupted before completed should call this
9004 * function. Functions that add code points anywhere else but to the end
9005 * of an inversion list assert that they are not in the middle of an
9006 * iteration. If they were, the addition would make the iteration
9007 * problematical: if the iteration hadn't reached the place where things
9008 * were being added, it would be ok */
9010 PERL_ARGS_ASSERT_INVLIST_ITERFINISH;
9012 *get_invlist_iter_addr(invlist) = (STRLEN) UV_MAX;
9016 S_invlist_iternext(SV* invlist, UV* start, UV* end)
9018 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
9019 * This call sets in <*start> and <*end>, the next range in <invlist>.
9020 * Returns <TRUE> if successful and the next call will return the next
9021 * range; <FALSE> if was already at the end of the list. If the latter,
9022 * <*start> and <*end> are unchanged, and the next call to this function
9023 * will start over at the beginning of the list */
9025 STRLEN* pos = get_invlist_iter_addr(invlist);
9026 UV len = _invlist_len(invlist);
9029 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
9032 *pos = (STRLEN) UV_MAX; /* Force iterinit() to be required next time */
9036 array = invlist_array(invlist);
9038 *start = array[(*pos)++];
9044 *end = array[(*pos)++] - 1;
9050 PERL_STATIC_INLINE bool
9051 S_invlist_is_iterating(SV* const invlist)
9053 PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
9055 return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
9058 PERL_STATIC_INLINE UV
9059 S_invlist_highest(SV* const invlist)
9061 /* Returns the highest code point that matches an inversion list. This API
9062 * has an ambiguity, as it returns 0 under either the highest is actually
9063 * 0, or if the list is empty. If this distinction matters to you, check
9064 * for emptiness before calling this function */
9066 UV len = _invlist_len(invlist);
9069 PERL_ARGS_ASSERT_INVLIST_HIGHEST;
9075 array = invlist_array(invlist);
9077 /* The last element in the array in the inversion list always starts a
9078 * range that goes to infinity. That range may be for code points that are
9079 * matched in the inversion list, or it may be for ones that aren't
9080 * matched. In the latter case, the highest code point in the set is one
9081 * less than the beginning of this range; otherwise it is the final element
9082 * of this range: infinity */
9083 return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
9085 : array[len - 1] - 1;
9088 #ifndef PERL_IN_XSUB_RE
9090 Perl__invlist_contents(pTHX_ SV* const invlist)
9092 /* Get the contents of an inversion list into a string SV so that they can
9093 * be printed out. It uses the format traditionally done for debug tracing
9097 SV* output = newSVpvs("\n");
9099 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
9101 assert(! invlist_is_iterating(invlist));
9103 invlist_iterinit(invlist);
9104 while (invlist_iternext(invlist, &start, &end)) {
9105 if (end == UV_MAX) {
9106 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
9108 else if (end != start) {
9109 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
9113 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
9121 #ifndef PERL_IN_XSUB_RE
9123 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
9124 const char * const indent, SV* const invlist)
9126 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
9127 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
9128 * the string 'indent'. The output looks like this:
9129 [0] 0x000A .. 0x000D
9131 [4] 0x2028 .. 0x2029
9132 [6] 0x3104 .. INFINITY
9133 * This means that the first range of code points matched by the list are
9134 * 0xA through 0xD; the second range contains only the single code point
9135 * 0x85, etc. An inversion list is an array of UVs. Two array elements
9136 * are used to define each range (except if the final range extends to
9137 * infinity, only a single element is needed). The array index of the
9138 * first element for the corresponding range is given in brackets. */
9143 PERL_ARGS_ASSERT__INVLIST_DUMP;
9145 if (invlist_is_iterating(invlist)) {
9146 Perl_dump_indent(aTHX_ level, file,
9147 "%sCan't dump inversion list because is in middle of iterating\n",
9152 invlist_iterinit(invlist);
9153 while (invlist_iternext(invlist, &start, &end)) {
9154 if (end == UV_MAX) {
9155 Perl_dump_indent(aTHX_ level, file,
9156 "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
9157 indent, (UV)count, start);
9159 else if (end != start) {
9160 Perl_dump_indent(aTHX_ level, file,
9161 "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
9162 indent, (UV)count, start, end);
9165 Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
9166 indent, (UV)count, start);
9173 Perl__load_PL_utf8_foldclosures (pTHX)
9175 assert(! PL_utf8_foldclosures);
9177 /* If the folds haven't been read in, call a fold function
9179 if (! PL_utf8_tofold) {
9180 U8 dummy[UTF8_MAXBYTES_CASE+1];
9182 /* This string is just a short named one above \xff */
9183 to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
9184 assert(PL_utf8_tofold); /* Verify that worked */
9186 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
9190 #ifdef PERL_ARGS_ASSERT__INVLISTEQ
9192 S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
9194 /* Return a boolean as to if the two passed in inversion lists are
9195 * identical. The final argument, if TRUE, says to take the complement of
9196 * the second inversion list before doing the comparison */
9198 const UV* array_a = invlist_array(a);
9199 const UV* array_b = invlist_array(b);
9200 UV len_a = _invlist_len(a);
9201 UV len_b = _invlist_len(b);
9203 UV i = 0; /* current index into the arrays */
9204 bool retval = TRUE; /* Assume are identical until proven otherwise */
9206 PERL_ARGS_ASSERT__INVLISTEQ;
9208 /* If are to compare 'a' with the complement of b, set it
9209 * up so are looking at b's complement. */
9212 /* The complement of nothing is everything, so <a> would have to have
9213 * just one element, starting at zero (ending at infinity) */
9215 return (len_a == 1 && array_a[0] == 0);
9217 else if (array_b[0] == 0) {
9219 /* Otherwise, to complement, we invert. Here, the first element is
9220 * 0, just remove it. To do this, we just pretend the array starts
9228 /* But if the first element is not zero, we pretend the list starts
9229 * at the 0 that is always stored immediately before the array. */
9235 /* Make sure that the lengths are the same, as well as the final element
9236 * before looping through the remainder. (Thus we test the length, final,
9237 * and first elements right off the bat) */
9238 if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
9241 else for (i = 0; i < len_a - 1; i++) {
9242 if (array_a[i] != array_b[i]) {
9252 #undef HEADER_LENGTH
9253 #undef TO_INTERNAL_SIZE
9254 #undef FROM_INTERNAL_SIZE
9255 #undef INVLIST_VERSION_ID
9257 /* End of inversion list object */
9260 S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state)
9262 /* This parses the flags that are in either the '(?foo)' or '(?foo:bar)'
9263 * constructs, and updates RExC_flags with them. On input, RExC_parse
9264 * should point to the first flag; it is updated on output to point to the
9265 * final ')' or ':'. There needs to be at least one flag, or this will
9268 /* for (?g), (?gc), and (?o) warnings; warning
9269 about (?c) will warn about (?g) -- japhy */
9271 #define WASTED_O 0x01
9272 #define WASTED_G 0x02
9273 #define WASTED_C 0x04
9274 #define WASTED_GC (WASTED_G|WASTED_C)
9275 I32 wastedflags = 0x00;
9276 U32 posflags = 0, negflags = 0;
9277 U32 *flagsp = &posflags;
9278 char has_charset_modifier = '\0';
9280 bool has_use_defaults = FALSE;
9281 const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
9283 PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
9285 /* '^' as an initial flag sets certain defaults */
9286 if (UCHARAT(RExC_parse) == '^') {
9288 has_use_defaults = TRUE;
9289 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
9290 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
9291 ? REGEX_UNICODE_CHARSET
9292 : REGEX_DEPENDS_CHARSET);
9295 cs = get_regex_charset(RExC_flags);
9296 if (cs == REGEX_DEPENDS_CHARSET
9297 && (RExC_utf8 || RExC_uni_semantics))
9299 cs = REGEX_UNICODE_CHARSET;
9302 while (*RExC_parse) {
9303 /* && strchr("iogcmsx", *RExC_parse) */
9304 /* (?g), (?gc) and (?o) are useless here
9305 and must be globally applied -- japhy */
9306 switch (*RExC_parse) {
9308 /* Code for the imsx flags */
9309 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9311 case LOCALE_PAT_MOD:
9312 if (has_charset_modifier) {
9313 goto excess_modifier;
9315 else if (flagsp == &negflags) {
9318 cs = REGEX_LOCALE_CHARSET;
9319 has_charset_modifier = LOCALE_PAT_MOD;
9321 case UNICODE_PAT_MOD:
9322 if (has_charset_modifier) {
9323 goto excess_modifier;
9325 else if (flagsp == &negflags) {
9328 cs = REGEX_UNICODE_CHARSET;
9329 has_charset_modifier = UNICODE_PAT_MOD;
9331 case ASCII_RESTRICT_PAT_MOD:
9332 if (flagsp == &negflags) {
9335 if (has_charset_modifier) {
9336 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
9337 goto excess_modifier;
9339 /* Doubled modifier implies more restricted */
9340 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
9343 cs = REGEX_ASCII_RESTRICTED_CHARSET;
9345 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
9347 case DEPENDS_PAT_MOD:
9348 if (has_use_defaults) {
9349 goto fail_modifiers;
9351 else if (flagsp == &negflags) {
9354 else if (has_charset_modifier) {
9355 goto excess_modifier;
9358 /* The dual charset means unicode semantics if the
9359 * pattern (or target, not known until runtime) are
9360 * utf8, or something in the pattern indicates unicode
9362 cs = (RExC_utf8 || RExC_uni_semantics)
9363 ? REGEX_UNICODE_CHARSET
9364 : REGEX_DEPENDS_CHARSET;
9365 has_charset_modifier = DEPENDS_PAT_MOD;
9369 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
9370 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
9372 else if (has_charset_modifier == *(RExC_parse - 1)) {
9373 vFAIL2("Regexp modifier \"%c\" may not appear twice",
9377 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
9382 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
9385 case ONCE_PAT_MOD: /* 'o' */
9386 case GLOBAL_PAT_MOD: /* 'g' */
9387 if (PASS2 && ckWARN(WARN_REGEXP)) {
9388 const I32 wflagbit = *RExC_parse == 'o'
9391 if (! (wastedflags & wflagbit) ) {
9392 wastedflags |= wflagbit;
9393 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9396 "Useless (%s%c) - %suse /%c modifier",
9397 flagsp == &negflags ? "?-" : "?",
9399 flagsp == &negflags ? "don't " : "",
9406 case CONTINUE_PAT_MOD: /* 'c' */
9407 if (PASS2 && ckWARN(WARN_REGEXP)) {
9408 if (! (wastedflags & WASTED_C) ) {
9409 wastedflags |= WASTED_GC;
9410 /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
9413 "Useless (%sc) - %suse /gc modifier",
9414 flagsp == &negflags ? "?-" : "?",
9415 flagsp == &negflags ? "don't " : ""
9420 case KEEPCOPY_PAT_MOD: /* 'p' */
9421 if (flagsp == &negflags) {
9423 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
9425 *flagsp |= RXf_PMf_KEEPCOPY;
9429 /* A flag is a default iff it is following a minus, so
9430 * if there is a minus, it means will be trying to
9431 * re-specify a default which is an error */
9432 if (has_use_defaults || flagsp == &negflags) {
9433 goto fail_modifiers;
9436 wastedflags = 0; /* reset so (?g-c) warns twice */
9440 RExC_flags |= posflags;
9441 RExC_flags &= ~negflags;
9442 set_regex_charset(&RExC_flags, cs);
9443 if (RExC_flags & RXf_PMf_FOLD) {
9444 RExC_contains_i = 1;
9450 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9451 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9452 vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
9453 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9462 - reg - regular expression, i.e. main body or parenthesized thing
9464 * Caller must absorb opening parenthesis.
9466 * Combining parenthesis handling with the base level of regular expression
9467 * is a trifle forced, but the need to tie the tails of the branches to what
9468 * follows makes it hard to avoid.
9470 #define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
9472 #define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
9474 #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
9477 /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
9478 flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
9479 needs to be restarted.
9480 Otherwise would only return NULL if regbranch() returns NULL, which
9483 S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
9484 /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
9485 * 2 is like 1, but indicates that nextchar() has been called to advance
9486 * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
9487 * this flag alerts us to the need to check for that */
9489 regnode *ret; /* Will be the head of the group. */
9492 regnode *ender = NULL;
9495 U32 oregflags = RExC_flags;
9496 bool have_branch = 0;
9498 I32 freeze_paren = 0;
9499 I32 after_freeze = 0;
9500 I32 num; /* numeric backreferences */
9502 char * parse_start = RExC_parse; /* MJD */
9503 char * const oregcomp_parse = RExC_parse;
9505 GET_RE_DEBUG_FLAGS_DECL;
9507 PERL_ARGS_ASSERT_REG;
9508 DEBUG_PARSE("reg ");
9510 *flagp = 0; /* Tentatively. */
9513 /* Make an OPEN node, if parenthesized. */
9516 /* Under /x, space and comments can be gobbled up between the '(' and
9517 * here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
9518 * intervening space, as the sequence is a token, and a token should be
9520 bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
9522 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
9523 char *start_verb = RExC_parse;
9524 STRLEN verb_len = 0;
9525 char *start_arg = NULL;
9526 unsigned char op = 0;
9528 int internal_argval = 0; /* internal_argval is only useful if
9531 if (has_intervening_patws) {
9533 vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
9535 while ( *RExC_parse && *RExC_parse != ')' ) {
9536 if ( *RExC_parse == ':' ) {
9537 start_arg = RExC_parse + 1;
9543 verb_len = RExC_parse - start_verb;
9546 while ( *RExC_parse && *RExC_parse != ')' )
9548 if ( *RExC_parse != ')' )
9549 vFAIL("Unterminated verb pattern argument");
9550 if ( RExC_parse == start_arg )
9553 if ( *RExC_parse != ')' )
9554 vFAIL("Unterminated verb pattern");
9557 switch ( *start_verb ) {
9558 case 'A': /* (*ACCEPT) */
9559 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
9561 internal_argval = RExC_nestroot;
9564 case 'C': /* (*COMMIT) */
9565 if ( memEQs(start_verb,verb_len,"COMMIT") )
9568 case 'F': /* (*FAIL) */
9569 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
9574 case ':': /* (*:NAME) */
9575 case 'M': /* (*MARK:NAME) */
9576 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
9581 case 'P': /* (*PRUNE) */
9582 if ( memEQs(start_verb,verb_len,"PRUNE") )
9585 case 'S': /* (*SKIP) */
9586 if ( memEQs(start_verb,verb_len,"SKIP") )
9589 case 'T': /* (*THEN) */
9590 /* [19:06] <TimToady> :: is then */
9591 if ( memEQs(start_verb,verb_len,"THEN") ) {
9593 RExC_seen |= REG_CUTGROUP_SEEN;
9598 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
9600 "Unknown verb pattern '%"UTF8f"'",
9601 UTF8fARG(UTF, verb_len, start_verb));
9604 if ( start_arg && internal_argval ) {
9605 vFAIL3("Verb pattern '%.*s' may not have an argument",
9606 verb_len, start_verb);
9607 } else if ( argok < 0 && !start_arg ) {
9608 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
9609 verb_len, start_verb);
9611 ret = reganode(pRExC_state, op, internal_argval);
9612 if ( ! internal_argval && ! SIZE_ONLY ) {
9614 SV *sv = newSVpvn( start_arg,
9615 RExC_parse - start_arg);
9616 ARG(ret) = add_data( pRExC_state,
9618 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
9625 if (!internal_argval)
9626 RExC_seen |= REG_VERBARG_SEEN;
9627 } else if ( start_arg ) {
9628 vFAIL3("Verb pattern '%.*s' may not have an argument",
9629 verb_len, start_verb);
9631 ret = reg_node(pRExC_state, op);
9633 nextchar(pRExC_state);
9636 else if (*RExC_parse == '?') { /* (?...) */
9637 bool is_logical = 0;
9638 const char * const seqstart = RExC_parse;
9639 const char * endptr;
9640 if (has_intervening_patws) {
9642 vFAIL("In '(?...)', the '(' and '?' must be adjacent");
9646 paren = *RExC_parse++;
9647 ret = NULL; /* For look-ahead/behind. */
9650 case 'P': /* (?P...) variants for those used to PCRE/Python */
9651 paren = *RExC_parse++;
9652 if ( paren == '<') /* (?P<...>) named capture */
9654 else if (paren == '>') { /* (?P>name) named recursion */
9655 goto named_recursion;
9657 else if (paren == '=') { /* (?P=...) named backref */
9658 /* this pretty much dupes the code for \k<NAME> in
9659 * regatom(), if you change this make sure you change that
9661 char* name_start = RExC_parse;
9663 SV *sv_dat = reg_scan_name(pRExC_state,
9664 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9665 if (RExC_parse == name_start || *RExC_parse != ')')
9666 /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
9667 vFAIL2("Sequence %.3s... not terminated",parse_start);
9670 num = add_data( pRExC_state, STR_WITH_LEN("S"));
9671 RExC_rxi->data->data[num]=(void*)sv_dat;
9672 SvREFCNT_inc_simple_void(sv_dat);
9675 ret = reganode(pRExC_state,
9678 : (ASCII_FOLD_RESTRICTED)
9680 : (AT_LEAST_UNI_SEMANTICS)
9688 Set_Node_Offset(ret, parse_start+1);
9689 Set_Node_Cur_Length(ret, parse_start);
9691 nextchar(pRExC_state);
9695 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9696 vFAIL3("Sequence (%.*s...) not recognized",
9697 RExC_parse-seqstart, seqstart);
9699 case '<': /* (?<...) */
9700 if (*RExC_parse == '!')
9702 else if (*RExC_parse != '=')
9708 case '\'': /* (?'...') */
9709 name_start= RExC_parse;
9710 svname = reg_scan_name(pRExC_state,
9711 SIZE_ONLY /* reverse test from the others */
9712 ? REG_RSN_RETURN_NAME
9713 : REG_RSN_RETURN_NULL);
9714 if (RExC_parse == name_start || *RExC_parse != paren)
9715 vFAIL2("Sequence (?%c... not terminated",
9716 paren=='>' ? '<' : paren);
9720 if (!svname) /* shouldn't happen */
9722 "panic: reg_scan_name returned NULL");
9723 if (!RExC_paren_names) {
9724 RExC_paren_names= newHV();
9725 sv_2mortal(MUTABLE_SV(RExC_paren_names));
9727 RExC_paren_name_list= newAV();
9728 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
9731 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
9733 sv_dat = HeVAL(he_str);
9735 /* croak baby croak */
9737 "panic: paren_name hash element allocation failed");
9738 } else if ( SvPOK(sv_dat) ) {
9739 /* (?|...) can mean we have dupes so scan to check
9740 its already been stored. Maybe a flag indicating
9741 we are inside such a construct would be useful,
9742 but the arrays are likely to be quite small, so
9743 for now we punt -- dmq */
9744 IV count = SvIV(sv_dat);
9745 I32 *pv = (I32*)SvPVX(sv_dat);
9747 for ( i = 0 ; i < count ; i++ ) {
9748 if ( pv[i] == RExC_npar ) {
9754 pv = (I32*)SvGROW(sv_dat,
9755 SvCUR(sv_dat) + sizeof(I32)+1);
9756 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
9757 pv[count] = RExC_npar;
9758 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
9761 (void)SvUPGRADE(sv_dat,SVt_PVNV);
9762 sv_setpvn(sv_dat, (char *)&(RExC_npar),
9765 SvIV_set(sv_dat, 1);
9768 /* Yes this does cause a memory leak in debugging Perls
9770 if (!av_store(RExC_paren_name_list,
9771 RExC_npar, SvREFCNT_inc(svname)))
9772 SvREFCNT_dec_NN(svname);
9775 /*sv_dump(sv_dat);*/
9777 nextchar(pRExC_state);
9779 goto capturing_parens;
9781 RExC_seen |= REG_LOOKBEHIND_SEEN;
9782 RExC_in_lookbehind++;
9785 case '=': /* (?=...) */
9786 RExC_seen_zerolen++;
9788 case '!': /* (?!...) */
9789 RExC_seen_zerolen++;
9790 if (*RExC_parse == ')') {
9791 ret=reg_node(pRExC_state, OPFAIL);
9792 nextchar(pRExC_state);
9796 case '|': /* (?|...) */
9797 /* branch reset, behave like a (?:...) except that
9798 buffers in alternations share the same numbers */
9800 after_freeze = freeze_paren = RExC_npar;
9802 case ':': /* (?:...) */
9803 case '>': /* (?>...) */
9805 case '$': /* (?$...) */
9806 case '@': /* (?@...) */
9807 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
9809 case '0' : /* (?0) */
9810 case 'R' : /* (?R) */
9811 if (*RExC_parse != ')')
9812 FAIL("Sequence (?R) not terminated");
9813 ret = reg_node(pRExC_state, GOSTART);
9814 RExC_seen |= REG_GOSTART_SEEN;
9815 *flagp |= POSTPONED;
9816 nextchar(pRExC_state);
9819 /* named and numeric backreferences */
9820 case '&': /* (?&NAME) */
9821 parse_start = RExC_parse - 1;
9824 SV *sv_dat = reg_scan_name(pRExC_state,
9825 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
9826 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
9828 if (RExC_parse == RExC_end || *RExC_parse != ')')
9829 vFAIL("Sequence (?&... not terminated");
9830 goto gen_recurse_regop;
9831 assert(0); /* NOT REACHED */
9833 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9835 vFAIL("Illegal pattern");
9837 goto parse_recursion;
9839 case '-': /* (?-1) */
9840 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
9841 RExC_parse--; /* rewind to let it be handled later */
9845 case '1': case '2': case '3': case '4': /* (?1) */
9846 case '5': case '6': case '7': case '8': case '9':
9850 bool is_neg = FALSE;
9851 parse_start = RExC_parse - 1; /* MJD */
9852 if (*RExC_parse == '-') {
9856 num = grok_atou(RExC_parse, &endptr);
9858 RExC_parse = (char*)endptr;
9860 /* Some limit for num? */
9864 if (*RExC_parse!=')')
9865 vFAIL("Expecting close bracket");
9868 if ( paren == '-' ) {
9870 Diagram of capture buffer numbering.
9871 Top line is the normal capture buffer numbers
9872 Bottom line is the negative indexing as from
9876 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
9880 num = RExC_npar + num;
9883 vFAIL("Reference to nonexistent group");
9885 } else if ( paren == '+' ) {
9886 num = RExC_npar + num - 1;
9889 ret = reganode(pRExC_state, GOSUB, num);
9891 if (num > (I32)RExC_rx->nparens) {
9893 vFAIL("Reference to nonexistent group");
9895 ARG2L_SET( ret, RExC_recurse_count++);
9897 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
9898 "Recurse #%"UVuf" to %"IVdf"\n",
9899 (UV)ARG(ret), (IV)ARG2L(ret)));
9903 RExC_seen |= REG_RECURSE_SEEN;
9904 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
9905 Set_Node_Offset(ret, parse_start); /* MJD */
9907 *flagp |= POSTPONED;
9908 nextchar(pRExC_state);
9911 assert(0); /* NOT REACHED */
9913 case '?': /* (??...) */
9915 if (*RExC_parse != '{') {
9917 /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
9919 "Sequence (%"UTF8f"...) not recognized",
9920 UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
9923 *flagp |= POSTPONED;
9924 paren = *RExC_parse++;
9926 case '{': /* (?{...}) */
9929 struct reg_code_block *cb;
9931 RExC_seen_zerolen++;
9933 if ( !pRExC_state->num_code_blocks
9934 || pRExC_state->code_index >= pRExC_state->num_code_blocks
9935 || pRExC_state->code_blocks[pRExC_state->code_index].start
9936 != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
9939 if (RExC_pm_flags & PMf_USE_RE_EVAL)
9940 FAIL("panic: Sequence (?{...}): no code block found\n");
9941 FAIL("Eval-group not allowed at runtime, use re 'eval'");
9943 /* this is a pre-compiled code block (?{...}) */
9944 cb = &pRExC_state->code_blocks[pRExC_state->code_index];
9945 RExC_parse = RExC_start + cb->end;
9948 if (cb->src_regex) {
9949 n = add_data(pRExC_state, STR_WITH_LEN("rl"));
9950 RExC_rxi->data->data[n] =
9951 (void*)SvREFCNT_inc((SV*)cb->src_regex);
9952 RExC_rxi->data->data[n+1] = (void*)o;
9955 n = add_data(pRExC_state,
9956 (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
9957 RExC_rxi->data->data[n] = (void*)o;
9960 pRExC_state->code_index++;
9961 nextchar(pRExC_state);
9965 ret = reg_node(pRExC_state, LOGICAL);
9966 eval = reganode(pRExC_state, EVAL, n);
9969 /* for later propagation into (??{}) return value */
9970 eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
9972 REGTAIL(pRExC_state, ret, eval);
9973 /* deal with the length of this later - MJD */
9976 ret = reganode(pRExC_state, EVAL, n);
9977 Set_Node_Length(ret, RExC_parse - parse_start + 1);
9978 Set_Node_Offset(ret, parse_start);
9981 case '(': /* (?(?{...})...) and (?(?=...)...) */
9984 if (RExC_parse[0] == '?') { /* (?(?...)) */
9985 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
9986 || RExC_parse[1] == '<'
9987 || RExC_parse[1] == '{') { /* Lookahead or eval. */
9991 ret = reg_node(pRExC_state, LOGICAL);
9995 tail = reg(pRExC_state, 1, &flag, depth+1);
9996 if (flag & RESTART_UTF8) {
9997 *flagp = RESTART_UTF8;
10000 REGTAIL(pRExC_state, ret, tail);
10003 /* Fall through to ‘Unknown switch condition’ at the
10004 end of the if/else chain. */
10006 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
10007 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
10009 char ch = RExC_parse[0] == '<' ? '>' : '\'';
10010 char *name_start= RExC_parse++;
10012 SV *sv_dat=reg_scan_name(pRExC_state,
10013 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
10014 if (RExC_parse == name_start || *RExC_parse != ch)
10015 vFAIL2("Sequence (?(%c... not terminated",
10016 (ch == '>' ? '<' : ch));
10019 num = add_data( pRExC_state, STR_WITH_LEN("S"));
10020 RExC_rxi->data->data[num]=(void*)sv_dat;
10021 SvREFCNT_inc_simple_void(sv_dat);
10023 ret = reganode(pRExC_state,NGROUPP,num);
10024 goto insert_if_check_paren;
10026 else if (RExC_parse[0] == 'D' &&
10027 RExC_parse[1] == 'E' &&
10028 RExC_parse[2] == 'F' &&
10029 RExC_parse[3] == 'I' &&
10030 RExC_parse[4] == 'N' &&
10031 RExC_parse[5] == 'E')
10033 ret = reganode(pRExC_state,DEFINEP,0);
10036 goto insert_if_check_paren;
10038 else if (RExC_parse[0] == 'R') {
10041 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10042 parno = grok_atou(RExC_parse, &endptr);
10044 RExC_parse = (char*)endptr;
10045 } else if (RExC_parse[0] == '&') {
10048 sv_dat = reg_scan_name(pRExC_state,
10050 ? REG_RSN_RETURN_NULL
10051 : REG_RSN_RETURN_DATA);
10052 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
10054 ret = reganode(pRExC_state,INSUBP,parno);
10055 goto insert_if_check_paren;
10057 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
10061 parno = grok_atou(RExC_parse, &endptr);
10063 RExC_parse = (char*)endptr;
10064 ret = reganode(pRExC_state, GROUPP, parno);
10066 insert_if_check_paren:
10067 if (*(tmp = nextchar(pRExC_state)) != ')') {
10068 /* nextchar also skips comments, so undo its work
10069 * and skip over the the next character.
10072 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10073 vFAIL("Switch condition not recognized");
10076 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
10077 br = regbranch(pRExC_state, &flags, 1,depth+1);
10079 if (flags & RESTART_UTF8) {
10080 *flagp = RESTART_UTF8;
10083 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10086 REGTAIL(pRExC_state, br, reganode(pRExC_state,
10088 c = *nextchar(pRExC_state);
10089 if (flags&HASWIDTH)
10090 *flagp |= HASWIDTH;
10093 vFAIL("(?(DEFINE)....) does not allow branches");
10095 /* Fake one for optimizer. */
10096 lastbr = reganode(pRExC_state, IFTHEN, 0);
10098 if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
10099 if (flags & RESTART_UTF8) {
10100 *flagp = RESTART_UTF8;
10103 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
10106 REGTAIL(pRExC_state, ret, lastbr);
10107 if (flags&HASWIDTH)
10108 *flagp |= HASWIDTH;
10109 c = *nextchar(pRExC_state);
10114 vFAIL("Switch (?(condition)... contains too many branches");
10115 ender = reg_node(pRExC_state, TAIL);
10116 REGTAIL(pRExC_state, br, ender);
10118 REGTAIL(pRExC_state, lastbr, ender);
10119 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10122 REGTAIL(pRExC_state, ret, ender);
10123 RExC_size++; /* XXX WHY do we need this?!!
10124 For large programs it seems to be required
10125 but I can't figure out why. -- dmq*/
10128 RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
10129 vFAIL("Unknown switch condition (?(...))");
10131 case '[': /* (?[ ... ]) */
10132 return handle_regex_sets(pRExC_state, NULL, flagp, depth,
10135 RExC_parse--; /* for vFAIL to print correctly */
10136 vFAIL("Sequence (? incomplete");
10138 default: /* e.g., (?i) */
10141 parse_lparen_question_flags(pRExC_state);
10142 if (UCHARAT(RExC_parse) != ':') {
10143 nextchar(pRExC_state);
10148 nextchar(pRExC_state);
10158 ret = reganode(pRExC_state, OPEN, parno);
10160 if (!RExC_nestroot)
10161 RExC_nestroot = parno;
10162 if (RExC_seen & REG_RECURSE_SEEN
10163 && !RExC_open_parens[parno-1])
10165 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10166 "Setting open paren #%"IVdf" to %d\n",
10167 (IV)parno, REG_NODE_NUM(ret)));
10168 RExC_open_parens[parno-1]= ret;
10171 Set_Node_Length(ret, 1); /* MJD */
10172 Set_Node_Offset(ret, RExC_parse); /* MJD */
10180 /* Pick up the branches, linking them together. */
10181 parse_start = RExC_parse; /* MJD */
10182 br = regbranch(pRExC_state, &flags, 1,depth+1);
10184 /* branch_len = (paren != 0); */
10187 if (flags & RESTART_UTF8) {
10188 *flagp = RESTART_UTF8;
10191 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10193 if (*RExC_parse == '|') {
10194 if (!SIZE_ONLY && RExC_extralen) {
10195 reginsert(pRExC_state, BRANCHJ, br, depth+1);
10198 reginsert(pRExC_state, BRANCH, br, depth+1);
10199 Set_Node_Length(br, paren != 0);
10200 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
10204 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
10206 else if (paren == ':') {
10207 *flagp |= flags&SIMPLE;
10209 if (is_open) { /* Starts with OPEN. */
10210 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
10212 else if (paren != '?') /* Not Conditional */
10214 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10216 while (*RExC_parse == '|') {
10217 if (!SIZE_ONLY && RExC_extralen) {
10218 ender = reganode(pRExC_state, LONGJMP,0);
10220 /* Append to the previous. */
10221 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
10224 RExC_extralen += 2; /* Account for LONGJMP. */
10225 nextchar(pRExC_state);
10226 if (freeze_paren) {
10227 if (RExC_npar > after_freeze)
10228 after_freeze = RExC_npar;
10229 RExC_npar = freeze_paren;
10231 br = regbranch(pRExC_state, &flags, 0, depth+1);
10234 if (flags & RESTART_UTF8) {
10235 *flagp = RESTART_UTF8;
10238 FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
10240 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
10242 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
10245 if (have_branch || paren != ':') {
10246 /* Make a closing node, and hook it on the end. */
10249 ender = reg_node(pRExC_state, TAIL);
10252 ender = reganode(pRExC_state, CLOSE, parno);
10253 if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
10254 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
10255 "Setting close paren #%"IVdf" to %d\n",
10256 (IV)parno, REG_NODE_NUM(ender)));
10257 RExC_close_parens[parno-1]= ender;
10258 if (RExC_nestroot == parno)
10261 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
10262 Set_Node_Length(ender,1); /* MJD */
10268 *flagp &= ~HASWIDTH;
10271 ender = reg_node(pRExC_state, SUCCEED);
10274 ender = reg_node(pRExC_state, END);
10276 assert(!RExC_opend); /* there can only be one! */
10277 RExC_opend = ender;
10281 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10282 SV * const mysv_val1=sv_newmortal();
10283 SV * const mysv_val2=sv_newmortal();
10284 DEBUG_PARSE_MSG("lsbr");
10285 regprop(RExC_rx, mysv_val1, lastbr, NULL);
10286 regprop(RExC_rx, mysv_val2, ender, NULL);
10287 PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10288 SvPV_nolen_const(mysv_val1),
10289 (IV)REG_NODE_NUM(lastbr),
10290 SvPV_nolen_const(mysv_val2),
10291 (IV)REG_NODE_NUM(ender),
10292 (IV)(ender - lastbr)
10295 REGTAIL(pRExC_state, lastbr, ender);
10297 if (have_branch && !SIZE_ONLY) {
10298 char is_nothing= 1;
10300 RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
10302 /* Hook the tails of the branches to the closing node. */
10303 for (br = ret; br; br = regnext(br)) {
10304 const U8 op = PL_regkind[OP(br)];
10305 if (op == BRANCH) {
10306 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
10307 if ( OP(NEXTOPER(br)) != NOTHING
10308 || regnext(NEXTOPER(br)) != ender)
10311 else if (op == BRANCHJ) {
10312 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
10313 /* for now we always disable this optimisation * /
10314 if ( OP(NEXTOPER(NEXTOPER(br))) != NOTHING
10315 || regnext(NEXTOPER(NEXTOPER(br))) != ender)
10321 br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
10322 DEBUG_PARSE_r(if (!SIZE_ONLY) {
10323 SV * const mysv_val1=sv_newmortal();
10324 SV * const mysv_val2=sv_newmortal();
10325 DEBUG_PARSE_MSG("NADA");
10326 regprop(RExC_rx, mysv_val1, ret, NULL);
10327 regprop(RExC_rx, mysv_val2, ender, NULL);
10328 PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
10329 SvPV_nolen_const(mysv_val1),
10330 (IV)REG_NODE_NUM(ret),
10331 SvPV_nolen_const(mysv_val2),
10332 (IV)REG_NODE_NUM(ender),
10337 if (OP(ender) == TAIL) {
10342 for ( opt= br + 1; opt < ender ; opt++ )
10343 OP(opt)= OPTIMIZED;
10344 NEXT_OFF(br)= ender - br;
10352 static const char parens[] = "=!<,>";
10354 if (paren && (p = strchr(parens, paren))) {
10355 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
10356 int flag = (p - parens) > 1;
10359 node = SUSPEND, flag = 0;
10360 reginsert(pRExC_state, node,ret, depth+1);
10361 Set_Node_Cur_Length(ret, parse_start);
10362 Set_Node_Offset(ret, parse_start + 1);
10364 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
10368 /* Check for proper termination. */
10370 /* restore original flags, but keep (?p) */
10371 RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
10372 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
10373 RExC_parse = oregcomp_parse;
10374 vFAIL("Unmatched (");
10377 else if (!paren && RExC_parse < RExC_end) {
10378 if (*RExC_parse == ')') {
10380 vFAIL("Unmatched )");
10383 FAIL("Junk on end of regexp"); /* "Can't happen". */
10384 assert(0); /* NOTREACHED */
10387 if (RExC_in_lookbehind) {
10388 RExC_in_lookbehind--;
10390 if (after_freeze > RExC_npar)
10391 RExC_npar = after_freeze;
10396 - regbranch - one alternative of an | operator
10398 * Implements the concatenation operator.
10400 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10404 S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
10407 regnode *chain = NULL;
10409 I32 flags = 0, c = 0;
10410 GET_RE_DEBUG_FLAGS_DECL;
10412 PERL_ARGS_ASSERT_REGBRANCH;
10414 DEBUG_PARSE("brnc");
10419 if (!SIZE_ONLY && RExC_extralen)
10420 ret = reganode(pRExC_state, BRANCHJ,0);
10422 ret = reg_node(pRExC_state, BRANCH);
10423 Set_Node_Length(ret, 1);
10427 if (!first && SIZE_ONLY)
10428 RExC_extralen += 1; /* BRANCHJ */
10430 *flagp = WORST; /* Tentatively. */
10433 nextchar(pRExC_state);
10434 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
10435 flags &= ~TRYAGAIN;
10436 latest = regpiece(pRExC_state, &flags,depth+1);
10437 if (latest == NULL) {
10438 if (flags & TRYAGAIN)
10440 if (flags & RESTART_UTF8) {
10441 *flagp = RESTART_UTF8;
10444 FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
10446 else if (ret == NULL)
10448 *flagp |= flags&(HASWIDTH|POSTPONED);
10449 if (chain == NULL) /* First piece. */
10450 *flagp |= flags&SPSTART;
10453 REGTAIL(pRExC_state, chain, latest);
10458 if (chain == NULL) { /* Loop ran zero times. */
10459 chain = reg_node(pRExC_state, NOTHING);
10464 *flagp |= flags&SIMPLE;
10471 - regpiece - something followed by possible [*+?]
10473 * Note that the branching code sequences used for ? and the general cases
10474 * of * and + are somewhat optimized: they use the same NOTHING node as
10475 * both the endmarker for their branch list and the body of the last branch.
10476 * It might seem that this node could be dispensed with entirely, but the
10477 * endmarker role is not redundant.
10479 * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
10481 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
10485 S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
10491 const char * const origparse = RExC_parse;
10493 I32 max = REG_INFTY;
10494 #ifdef RE_TRACK_PATTERN_OFFSETS
10497 const char *maxpos = NULL;
10499 /* Save the original in case we change the emitted regop to a FAIL. */
10500 regnode * const orig_emit = RExC_emit;
10502 GET_RE_DEBUG_FLAGS_DECL;
10504 PERL_ARGS_ASSERT_REGPIECE;
10506 DEBUG_PARSE("piec");
10508 ret = regatom(pRExC_state, &flags,depth+1);
10510 if (flags & (TRYAGAIN|RESTART_UTF8))
10511 *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
10513 FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
10519 if (op == '{' && regcurly(RExC_parse)) {
10521 #ifdef RE_TRACK_PATTERN_OFFSETS
10522 parse_start = RExC_parse; /* MJD */
10524 next = RExC_parse + 1;
10525 while (isDIGIT(*next) || *next == ',') {
10526 if (*next == ',') {
10534 if (*next == '}') { /* got one */
10535 const char* endptr;
10539 min = grok_atou(RExC_parse, &endptr);
10540 if (*maxpos == ',')
10543 maxpos = RExC_parse;
10544 max = grok_atou(maxpos, &endptr);
10545 if (!max && *maxpos != '0')
10546 max = REG_INFTY; /* meaning "infinity" */
10547 else if (max >= REG_INFTY)
10548 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
10550 nextchar(pRExC_state);
10551 if (max < min) { /* If can't match, warn and optimize to fail
10555 /* We can't back off the size because we have to reserve
10556 * enough space for all the things we are about to throw
10557 * away, but we can shrink it by the ammount we are about
10558 * to re-use here */
10559 RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
10562 ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
10563 RExC_emit = orig_emit;
10565 ret = reg_node(pRExC_state, OPFAIL);
10568 else if (min == max
10569 && RExC_parse < RExC_end
10570 && (*RExC_parse == '?' || *RExC_parse == '+'))
10573 ckWARN2reg(RExC_parse + 1,
10574 "Useless use of greediness modifier '%c'",
10577 /* Absorb the modifier, so later code doesn't see nor use
10579 nextchar(pRExC_state);
10583 if ((flags&SIMPLE)) {
10584 RExC_naughty += 2 + RExC_naughty / 2;
10585 reginsert(pRExC_state, CURLY, ret, depth+1);
10586 Set_Node_Offset(ret, parse_start+1); /* MJD */
10587 Set_Node_Cur_Length(ret, parse_start);
10590 regnode * const w = reg_node(pRExC_state, WHILEM);
10593 REGTAIL(pRExC_state, ret, w);
10594 if (!SIZE_ONLY && RExC_extralen) {
10595 reginsert(pRExC_state, LONGJMP,ret, depth+1);
10596 reginsert(pRExC_state, NOTHING,ret, depth+1);
10597 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
10599 reginsert(pRExC_state, CURLYX,ret, depth+1);
10601 Set_Node_Offset(ret, parse_start+1);
10602 Set_Node_Length(ret,
10603 op == '{' ? (RExC_parse - parse_start) : 1);
10605 if (!SIZE_ONLY && RExC_extralen)
10606 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
10607 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
10609 RExC_whilem_seen++, RExC_extralen += 3;
10610 RExC_naughty += 4 + RExC_naughty; /* compound interest */
10617 *flagp |= HASWIDTH;
10619 ARG1_SET(ret, (U16)min);
10620 ARG2_SET(ret, (U16)max);
10622 if (max == REG_INFTY)
10623 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10629 if (!ISMULT1(op)) {
10634 #if 0 /* Now runtime fix should be reliable. */
10636 /* if this is reinstated, don't forget to put this back into perldiag:
10638 =item Regexp *+ operand could be empty at {#} in regex m/%s/
10640 (F) The part of the regexp subject to either the * or + quantifier
10641 could match an empty string. The {#} shows in the regular
10642 expression about where the problem was discovered.
10646 if (!(flags&HASWIDTH) && op != '?')
10647 vFAIL("Regexp *+ operand could be empty");
10650 #ifdef RE_TRACK_PATTERN_OFFSETS
10651 parse_start = RExC_parse;
10653 nextchar(pRExC_state);
10655 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
10657 if (op == '*' && (flags&SIMPLE)) {
10658 reginsert(pRExC_state, STAR, ret, depth+1);
10661 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10663 else if (op == '*') {
10667 else if (op == '+' && (flags&SIMPLE)) {
10668 reginsert(pRExC_state, PLUS, ret, depth+1);
10671 RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
10673 else if (op == '+') {
10677 else if (op == '?') {
10682 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
10683 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
10684 ckWARN2reg(RExC_parse,
10685 "%"UTF8f" matches null string many times",
10686 UTF8fARG(UTF, (RExC_parse >= origparse
10687 ? RExC_parse - origparse
10690 (void)ReREFCNT_inc(RExC_rx_sv);
10693 if (RExC_parse < RExC_end && *RExC_parse == '?') {
10694 nextchar(pRExC_state);
10695 reginsert(pRExC_state, MINMOD, ret, depth+1);
10696 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
10699 if (RExC_parse < RExC_end && *RExC_parse == '+') {
10701 nextchar(pRExC_state);
10702 ender = reg_node(pRExC_state, SUCCEED);
10703 REGTAIL(pRExC_state, ret, ender);
10704 reginsert(pRExC_state, SUSPEND, ret, depth+1);
10706 ender = reg_node(pRExC_state, TAIL);
10707 REGTAIL(pRExC_state, ret, ender);
10710 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
10712 vFAIL("Nested quantifiers");
10719 S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
10720 UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
10724 /* This is expected to be called by a parser routine that has recognized '\N'
10725 and needs to handle the rest. RExC_parse is expected to point at the first
10726 char following the N at the time of the call. On successful return,
10727 RExC_parse has been updated to point to just after the sequence identified
10728 by this routine, <*flagp> has been updated, and the non-NULL input pointers
10729 have been set appropriately.
10731 The typical case for this is \N{some character name}. This is usually
10732 called while parsing the input, filling in or ready to fill in an EXACTish
10733 node, and the code point for the character should be returned, so that it
10734 can be added to the node, and parsing continued with the next input
10735 character. But it may be that instead of a single character the \N{}
10736 expands to more than one, a named sequence. In this case any following
10737 quantifier applies to the whole sequence, and it is easier, given the code
10738 structure that calls this, to handle it from a different area of the code.
10739 For this reason, the input parameters can be set so that it returns valid
10740 only on one or the other of these cases.
10742 Another possibility is for the input to be an empty \N{}, which for
10743 backwards compatibility we accept, but generate a NOTHING node which should
10744 later get optimized out. This is handled from the area of code which can
10745 handle a named sequence, so if called with the parameters for the other, it
10748 Still another possibility is for the \N to mean [^\n], and not a single
10749 character or explicit sequence at all. This is determined by context.
10750 Again, this is handled from the area of code which can handle a named
10751 sequence, so if called with the parameters for the other, it also fails.
10753 And the final possibility is for the \N to be called from within a bracketed
10754 character class. In this case the [^\n] meaning makes no sense, and so is
10755 an error. Other anomalous situations are left to the calling code to handle.
10757 For non-single-quoted regexes, the tokenizer has attempted to decide which
10758 of the above applies, and in the case of a named sequence, has converted it
10759 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
10760 where c1... are the characters in the sequence. For single-quoted regexes,
10761 the tokenizer passes the \N sequence through unchanged; this code will not
10762 attempt to determine this nor expand those, instead raising a syntax error.
10763 The net effect is that if the beginning of the passed-in pattern isn't '{U+'
10764 or there is no '}', it signals that this \N occurrence means to match a
10765 non-newline. (This mostly was done because of [perl #56444].)
10767 The API is somewhat convoluted due to historical and the above reasons.
10769 The function raises an error (via vFAIL), and doesn't return for various
10770 syntax errors. For other failures, it returns (STRLEN) -1. For successes,
10771 it returns a count of how many characters were accounted for by it. (This
10772 can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
10773 points in the sequence. It sets <node_p>, <valuep>, and/or
10774 <substitute_parse> on success.
10776 If <valuep> is non-null, it means the caller can accept an input sequence
10777 consisting of a just a single code point; <*valuep> is set to the value
10778 of the only or first code point in the input.
10780 If <substitute_parse> is non-null, it means the caller can accept an input
10781 sequence consisting of one or more code points; <*substitute_parse> is a
10782 newly created mortal SV* in this case, containing \x{} escapes representing
10785 Both <valuep> and <substitute_parse> can be non-NULL.
10787 If <node_p> is non-null, <substitute_parse> must be NULL. This signifies
10788 that the caller can accept any legal sequence other than a single code
10789 point. To wit, <*node_p> is set as follows:
10790 1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
10791 2) \N{}: points to a new NOTHING node; return is 0
10792 3) otherwise: points to a new EXACT node containing the resolved
10793 string; return is the number of code points in the
10794 string. This will never be 1.
10795 Note that failure is returned for single code point sequences if <valuep> is
10796 null and <node_p> is not.
10799 char * endbrace; /* '}' following the name */
10801 char *endchar; /* Points to '.' or '}' ending cur char in the input
10803 bool has_multiple_chars; /* true if the input stream contains a sequence of
10804 more than one character */
10805 bool in_char_class = substitute_parse != NULL;
10806 STRLEN count = 0; /* Number of characters in this sequence */
10808 GET_RE_DEBUG_FLAGS_DECL;
10810 PERL_ARGS_ASSERT_GROK_BSLASH_N;
10812 GET_RE_DEBUG_FLAGS;
10814 assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
10815 assert(! (node_p && substitute_parse)); /* At most 1 should be set */
10817 /* The [^\n] meaning of \N ignores spaces and comments under the /x
10818 * modifier. The other meaning does not, so use a temporary until we find
10819 * out which we are being called with */
10820 p = (RExC_flags & RXf_PMf_EXTENDED)
10821 ? regpatws(pRExC_state, RExC_parse,
10822 TRUE) /* means recognize comments */
10825 /* Disambiguate between \N meaning a named character versus \N meaning
10826 * [^\n]. The former is assumed when it can't be the latter. */
10827 if (*p != '{' || regcurly(p)) {
10830 /* no bare \N allowed in a charclass */
10831 if (in_char_class) {
10832 vFAIL("\\N in a character class must be a named character: \\N{...}");
10834 return (STRLEN) -1;
10836 RExC_parse--; /* Need to back off so nextchar() doesn't skip the
10838 nextchar(pRExC_state);
10839 *node_p = reg_node(pRExC_state, REG_ANY);
10840 *flagp |= HASWIDTH|SIMPLE;
10842 Set_Node_Length(*node_p, 1); /* MJD */
10846 /* Here, we have decided it should be a named character or sequence */
10848 /* The test above made sure that the next real character is a '{', but
10849 * under the /x modifier, it could be separated by space (or a comment and
10850 * \n) and this is not allowed (for consistency with \x{...} and the
10851 * tokenizer handling of \N{NAME}). */
10852 if (*RExC_parse != '{') {
10853 vFAIL("Missing braces on \\N{}");
10856 RExC_parse++; /* Skip past the '{' */
10858 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
10859 || ! (endbrace == RExC_parse /* nothing between the {} */
10860 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
10862 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
10865 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
10866 vFAIL("\\N{NAME} must be resolved by the lexer");
10869 if (endbrace == RExC_parse) { /* empty: \N{} */
10871 *node_p = reg_node(pRExC_state,NOTHING);
10873 else if (! in_char_class) {
10874 return (STRLEN) -1;
10876 nextchar(pRExC_state);
10880 RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
10881 RExC_parse += 2; /* Skip past the 'U+' */
10883 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10885 /* Code points are separated by dots. If none, there is only one code
10886 * point, and is terminated by the brace */
10887 has_multiple_chars = (endchar < endbrace);
10889 /* We get the first code point if we want it, and either there is only one,
10890 * or we can accept both cases of one and more than one */
10891 if (valuep && (substitute_parse || ! has_multiple_chars)) {
10892 STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
10893 I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
10894 | PERL_SCAN_DISALLOW_PREFIX
10896 /* No errors in the first pass (See [perl
10897 * #122671].) We let the code below find the
10898 * errors when there are multiple chars. */
10899 | ((SIZE_ONLY || has_multiple_chars)
10900 ? PERL_SCAN_SILENT_ILLDIGIT
10903 *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
10905 /* The tokenizer should have guaranteed validity, but it's possible to
10906 * bypass it by using single quoting, so check. Don't do the check
10907 * here when there are multiple chars; we do it below anyway. */
10908 if (! has_multiple_chars) {
10909 if (length_of_hex == 0
10910 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
10912 RExC_parse += length_of_hex; /* Includes all the valid */
10913 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
10914 ? UTF8SKIP(RExC_parse)
10916 /* Guard against malformed utf8 */
10917 if (RExC_parse >= endchar) {
10918 RExC_parse = endchar;
10920 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10923 RExC_parse = endbrace + 1;
10928 /* Here, we should have already handled the case where a single character
10929 * is expected and found. So it is a failure if we aren't expecting
10930 * multiple chars and got them; or didn't get them but wanted them. We
10931 * fail without advancing the parse, so that the caller can try again with
10932 * different acceptance criteria */
10933 if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
10935 return (STRLEN) -1;
10940 /* What is done here is to convert this to a sub-pattern of the form
10941 * \x{char1}\x{char2}...
10942 * and then either return it in <*substitute_parse> if non-null; or
10943 * call reg recursively to parse it (enclosing in "(?: ... )" ). That
10944 * way, it retains its atomicness, while not having to worry about
10945 * special handling that some code points may have. toke.c has
10946 * converted the original Unicode values to native, so that we can just
10947 * pass on the hex values unchanged. We do have to set a flag to keep
10948 * recoding from happening in the recursion */
10952 char *orig_end = RExC_end;
10955 if (substitute_parse) {
10956 *substitute_parse = newSVpvs("");
10959 substitute_parse = &dummy;
10960 *substitute_parse = newSVpvs("?:");
10962 *substitute_parse = sv_2mortal(*substitute_parse);
10964 while (RExC_parse < endbrace) {
10966 /* Convert to notation the rest of the code understands */
10967 sv_catpv(*substitute_parse, "\\x{");
10968 sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
10969 sv_catpv(*substitute_parse, "}");
10971 /* Point to the beginning of the next character in the sequence. */
10972 RExC_parse = endchar + 1;
10973 endchar = RExC_parse + strcspn(RExC_parse, ".}");
10977 if (! in_char_class) {
10978 sv_catpv(*substitute_parse, ")");
10981 RExC_parse = SvPV(*substitute_parse, len);
10983 /* Don't allow empty number */
10984 if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
10985 RExC_parse = endbrace;
10986 vFAIL("Invalid hexadecimal number in \\N{U+...}");
10988 RExC_end = RExC_parse + len;
10990 /* The values are Unicode, and therefore not subject to recoding */
10991 RExC_override_recoding = 1;
10994 if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
10995 if (flags & RESTART_UTF8) {
10996 *flagp = RESTART_UTF8;
10997 return (STRLEN) -1;
10999 FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
11002 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11005 RExC_parse = endbrace;
11006 RExC_end = orig_end;
11007 RExC_override_recoding = 0;
11009 nextchar(pRExC_state);
11019 * It returns the code point in utf8 for the value in *encp.
11020 * value: a code value in the source encoding
11021 * encp: a pointer to an Encode object
11023 * If the result from Encode is not a single character,
11024 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
11027 S_reg_recode(pTHX_ const char value, SV **encp)
11030 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
11031 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
11032 const STRLEN newlen = SvCUR(sv);
11033 UV uv = UNICODE_REPLACEMENT;
11035 PERL_ARGS_ASSERT_REG_RECODE;
11039 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
11042 if (!newlen || numlen != newlen) {
11043 uv = UNICODE_REPLACEMENT;
11049 PERL_STATIC_INLINE U8
11050 S_compute_EXACTish(RExC_state_t *pRExC_state)
11054 PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
11060 op = get_regex_charset(RExC_flags);
11061 if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
11062 op--; /* /a is same as /u, and map /aa's offset to what /a's would have
11063 been, so there is no hole */
11066 return op + EXACTF;
11069 PERL_STATIC_INLINE void
11070 S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
11071 regnode *node, I32* flagp, STRLEN len, UV code_point,
11074 /* This knows the details about sizing an EXACTish node, setting flags for
11075 * it (by setting <*flagp>, and potentially populating it with a single
11078 * If <len> (the length in bytes) is non-zero, this function assumes that
11079 * the node has already been populated, and just does the sizing. In this
11080 * case <code_point> should be the final code point that has already been
11081 * placed into the node. This value will be ignored except that under some
11082 * circumstances <*flagp> is set based on it.
11084 * If <len> is zero, the function assumes that the node is to contain only
11085 * the single character given by <code_point> and calculates what <len>
11086 * should be. In pass 1, it sizes the node appropriately. In pass 2, it
11087 * additionally will populate the node's STRING with <code_point> or its
11090 * In both cases <*flagp> is appropriately set
11092 * It knows that under FOLD, the Latin Sharp S and UTF characters above
11093 * 255, must be folded (the former only when the rules indicate it can
11096 * When it does the populating, it looks at the flag 'downgradable'. If
11097 * true with a node that folds, it checks if the single code point
11098 * participates in a fold, and if not downgrades the node to an EXACT.
11099 * This helps the optimizer */
11101 bool len_passed_in = cBOOL(len != 0);
11102 U8 character[UTF8_MAXBYTES_CASE+1];
11104 PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
11106 /* Don't bother to check for downgrading in PASS1, as it doesn't make any
11107 * sizing difference, and is extra work that is thrown away */
11108 if (downgradable && ! PASS2) {
11109 downgradable = FALSE;
11112 if (! len_passed_in) {
11114 if (UNI_IS_INVARIANT(code_point)) {
11115 if (LOC || ! FOLD) { /* /l defers folding until runtime */
11116 *character = (U8) code_point;
11118 else { /* Here is /i and not /l. (toFOLD() is defined on just
11119 ASCII, which isn't the same thing as INVARIANT on
11120 EBCDIC, but it works there, as the extra invariants
11121 fold to themselves) */
11122 *character = toFOLD((U8) code_point);
11124 /* We can downgrade to an EXACT node if this character
11125 * isn't a folding one. Note that this assumes that
11126 * nothing above Latin1 folds to some other invariant than
11127 * one of these alphabetics; otherwise we would also have
11129 * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11130 * || ASCII_FOLD_RESTRICTED))
11132 if (downgradable && PL_fold[code_point] == code_point) {
11138 else if (FOLD && (! LOC
11139 || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
11140 { /* Folding, and ok to do so now */
11141 UV folded = _to_uni_fold_flags(
11145 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
11146 ? FOLD_FLAGS_NOMIX_ASCII
11149 && folded == code_point /* This quickly rules out many
11150 cases, avoiding the
11151 _invlist_contains_cp() overhead
11153 && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
11158 else if (code_point <= MAX_UTF8_TWO_BYTE) {
11160 /* Not folding this cp, and can output it directly */
11161 *character = UTF8_TWO_BYTE_HI(code_point);
11162 *(character + 1) = UTF8_TWO_BYTE_LO(code_point);
11166 uvchr_to_utf8( character, code_point);
11167 len = UTF8SKIP(character);
11169 } /* Else pattern isn't UTF8. */
11171 *character = (U8) code_point;
11173 } /* Else is folded non-UTF8 */
11174 else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
11176 /* We don't fold any non-UTF8 except possibly the Sharp s (see
11177 * comments at join_exact()); */
11178 *character = (U8) code_point;
11181 /* Can turn into an EXACT node if we know the fold at compile time,
11182 * and it folds to itself and doesn't particpate in other folds */
11185 && PL_fold_latin1[code_point] == code_point
11186 && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
11187 || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
11191 } /* else is Sharp s. May need to fold it */
11192 else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
11194 *(character + 1) = 's';
11198 *character = LATIN_SMALL_LETTER_SHARP_S;
11204 RExC_size += STR_SZ(len);
11207 RExC_emit += STR_SZ(len);
11208 STR_LEN(node) = len;
11209 if (! len_passed_in) {
11210 Copy((char *) character, STRING(node), len, char);
11214 *flagp |= HASWIDTH;
11216 /* A single character node is SIMPLE, except for the special-cased SHARP S
11218 if ((len == 1 || (UTF && len == UNISKIP(code_point)))
11219 && (code_point != LATIN_SMALL_LETTER_SHARP_S
11220 || ! FOLD || ! DEPENDS_SEMANTICS))
11225 /* The OP may not be well defined in PASS1 */
11226 if (PASS2 && OP(node) == EXACTFL) {
11227 RExC_contains_locale = 1;
11232 /* Parse backref decimal value, unless it's too big to sensibly be a backref,
11233 * in which case return I32_MAX (rather than possibly 32-bit wrapping) */
11236 S_backref_value(char *p)
11238 const char* endptr;
11239 UV val = grok_atou(p, &endptr);
11240 if (endptr == p || endptr == NULL || val > I32_MAX)
11247 - regatom - the lowest level
11249 Try to identify anything special at the start of the pattern. If there
11250 is, then handle it as required. This may involve generating a single regop,
11251 such as for an assertion; or it may involve recursing, such as to
11252 handle a () structure.
11254 If the string doesn't start with something special then we gobble up
11255 as much literal text as we can.
11257 Once we have been able to handle whatever type of thing started the
11258 sequence, we return.
11260 Note: we have to be careful with escapes, as they can be both literal
11261 and special, and in the case of \10 and friends, context determines which.
11263 A summary of the code structure is:
11265 switch (first_byte) {
11266 cases for each special:
11267 handle this special;
11270 switch (2nd byte) {
11271 cases for each unambiguous special:
11272 handle this special;
11274 cases for each ambigous special/literal:
11276 if (special) handle here
11278 default: // unambiguously literal:
11281 default: // is a literal char
11284 create EXACTish node for literal;
11285 while (more input and node isn't full) {
11286 switch (input_byte) {
11287 cases for each special;
11288 make sure parse pointer is set so that the next call to
11289 regatom will see this special first
11290 goto loopdone; // EXACTish node terminated by prev. char
11292 append char to EXACTISH node;
11294 get next input byte;
11298 return the generated node;
11300 Specifically there are two separate switches for handling
11301 escape sequences, with the one for handling literal escapes requiring
11302 a dummy entry for all of the special escapes that are actually handled
11305 Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
11307 Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
11309 Otherwise does not return NULL.
11313 S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
11315 regnode *ret = NULL;
11317 char *parse_start = RExC_parse;
11322 GET_RE_DEBUG_FLAGS_DECL;
11324 *flagp = WORST; /* Tentatively. */
11326 DEBUG_PARSE("atom");
11328 PERL_ARGS_ASSERT_REGATOM;
11331 switch ((U8)*RExC_parse) {
11333 RExC_seen_zerolen++;
11334 nextchar(pRExC_state);
11335 if (RExC_flags & RXf_PMf_MULTILINE)
11336 ret = reg_node(pRExC_state, MBOL);
11338 ret = reg_node(pRExC_state, SBOL);
11339 Set_Node_Length(ret, 1); /* MJD */
11342 nextchar(pRExC_state);
11344 RExC_seen_zerolen++;
11345 if (RExC_flags & RXf_PMf_MULTILINE)
11346 ret = reg_node(pRExC_state, MEOL);
11348 ret = reg_node(pRExC_state, SEOL);
11349 Set_Node_Length(ret, 1); /* MJD */
11352 nextchar(pRExC_state);
11353 if (RExC_flags & RXf_PMf_SINGLELINE)
11354 ret = reg_node(pRExC_state, SANY);
11356 ret = reg_node(pRExC_state, REG_ANY);
11357 *flagp |= HASWIDTH|SIMPLE;
11359 Set_Node_Length(ret, 1); /* MJD */
11363 char * const oregcomp_parse = ++RExC_parse;
11364 ret = regclass(pRExC_state, flagp,depth+1,
11365 FALSE, /* means parse the whole char class */
11366 TRUE, /* allow multi-char folds */
11367 FALSE, /* don't silence non-portable warnings. */
11369 if (*RExC_parse != ']') {
11370 RExC_parse = oregcomp_parse;
11371 vFAIL("Unmatched [");
11374 if (*flagp & RESTART_UTF8)
11376 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11379 nextchar(pRExC_state);
11380 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
11384 nextchar(pRExC_state);
11385 ret = reg(pRExC_state, 2, &flags,depth+1);
11387 if (flags & TRYAGAIN) {
11388 if (RExC_parse == RExC_end) {
11389 /* Make parent create an empty node if needed. */
11390 *flagp |= TRYAGAIN;
11395 if (flags & RESTART_UTF8) {
11396 *flagp = RESTART_UTF8;
11399 FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
11402 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
11406 if (flags & TRYAGAIN) {
11407 *flagp |= TRYAGAIN;
11410 vFAIL("Internal urp");
11411 /* Supposed to be caught earlier. */
11417 vFAIL("Quantifier follows nothing");
11422 This switch handles escape sequences that resolve to some kind
11423 of special regop and not to literal text. Escape sequnces that
11424 resolve to literal text are handled below in the switch marked
11427 Every entry in this switch *must* have a corresponding entry
11428 in the literal escape switch. However, the opposite is not
11429 required, as the default for this switch is to jump to the
11430 literal text handling code.
11432 switch ((U8)*++RExC_parse) {
11433 /* Special Escapes */
11435 RExC_seen_zerolen++;
11436 ret = reg_node(pRExC_state, SBOL);
11437 /* SBOL is shared with /^/ so we set the flags so we can tell
11438 * /\A/ from /^/ in split. We check ret because first pass we
11439 * have no regop struct to set the flags on. */
11443 goto finish_meta_pat;
11445 ret = reg_node(pRExC_state, GPOS);
11446 RExC_seen |= REG_GPOS_SEEN;
11448 goto finish_meta_pat;
11450 RExC_seen_zerolen++;
11451 ret = reg_node(pRExC_state, KEEPS);
11453 /* XXX:dmq : disabling in-place substitution seems to
11454 * be necessary here to avoid cases of memory corruption, as
11455 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
11457 RExC_seen |= REG_LOOKBEHIND_SEEN;
11458 goto finish_meta_pat;
11460 ret = reg_node(pRExC_state, SEOL);
11462 RExC_seen_zerolen++; /* Do not optimize RE away */
11463 goto finish_meta_pat;
11465 ret = reg_node(pRExC_state, EOS);
11467 RExC_seen_zerolen++; /* Do not optimize RE away */
11468 goto finish_meta_pat;
11470 ret = reg_node(pRExC_state, CANY);
11471 RExC_seen |= REG_CANY_SEEN;
11472 *flagp |= HASWIDTH|SIMPLE;
11474 ckWARNdep(RExC_parse+1, "\\C is deprecated");
11476 goto finish_meta_pat;
11478 ret = reg_node(pRExC_state, CLUMP);
11479 *flagp |= HASWIDTH;
11480 goto finish_meta_pat;
11486 arg = ANYOF_WORDCHAR;
11490 RExC_seen_zerolen++;
11491 RExC_seen |= REG_LOOKBEHIND_SEEN;
11492 op = BOUND + get_regex_charset(RExC_flags);
11493 if (op > BOUNDA) { /* /aa is same as /a */
11496 else if (op == BOUNDL) {
11497 RExC_contains_locale = 1;
11499 ret = reg_node(pRExC_state, op);
11500 FLAGS(ret) = get_regex_charset(RExC_flags);
11502 if ((U8) *(RExC_parse + 1) == '{') {
11503 /* diag_listed_as: Use "%s" instead of "%s" */
11504 vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
11506 goto finish_meta_pat;
11508 RExC_seen_zerolen++;
11509 RExC_seen |= REG_LOOKBEHIND_SEEN;
11510 op = NBOUND + get_regex_charset(RExC_flags);
11511 if (op > NBOUNDA) { /* /aa is same as /a */
11514 else if (op == NBOUNDL) {
11515 RExC_contains_locale = 1;
11517 ret = reg_node(pRExC_state, op);
11518 FLAGS(ret) = get_regex_charset(RExC_flags);
11520 if ((U8) *(RExC_parse + 1) == '{') {
11521 /* diag_listed_as: Use "%s" instead of "%s" */
11522 vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
11524 goto finish_meta_pat;
11534 ret = reg_node(pRExC_state, LNBREAK);
11535 *flagp |= HASWIDTH|SIMPLE;
11536 goto finish_meta_pat;
11544 goto join_posix_op_known;
11550 arg = ANYOF_VERTWS;
11552 goto join_posix_op_known;
11562 op = POSIXD + get_regex_charset(RExC_flags);
11563 if (op > POSIXA) { /* /aa is same as /a */
11566 else if (op == POSIXL) {
11567 RExC_contains_locale = 1;
11570 join_posix_op_known:
11573 op += NPOSIXD - POSIXD;
11576 ret = reg_node(pRExC_state, op);
11578 FLAGS(ret) = namedclass_to_classnum(arg);
11581 *flagp |= HASWIDTH|SIMPLE;
11585 nextchar(pRExC_state);
11586 Set_Node_Length(ret, 2); /* MJD */
11592 char* parse_start = RExC_parse - 2;
11597 ret = regclass(pRExC_state, flagp,depth+1,
11598 TRUE, /* means just parse this element */
11599 FALSE, /* don't allow multi-char folds */
11600 FALSE, /* don't silence non-portable warnings.
11601 It would be a bug if these returned
11604 /* regclass() can only return RESTART_UTF8 if multi-char folds
11607 FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
11612 Set_Node_Offset(ret, parse_start + 2);
11613 Set_Node_Cur_Length(ret, parse_start);
11614 nextchar(pRExC_state);
11618 /* Handle \N and \N{NAME} with multiple code points here and not
11619 * below because it can be multicharacter. join_exact() will join
11620 * them up later on. Also this makes sure that things like
11621 * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
11622 * The options to the grok function call causes it to fail if the
11623 * sequence is just a single code point. We then go treat it as
11624 * just another character in the current EXACT node, and hence it
11625 * gets uniform treatment with all the other characters. The
11626 * special treatment for quantifiers is not needed for such single
11627 * character sequences */
11629 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
11632 if (*flagp & RESTART_UTF8)
11638 case 'k': /* Handle \k<NAME> and \k'NAME' */
11641 char ch= RExC_parse[1];
11642 if (ch != '<' && ch != '\'' && ch != '{') {
11644 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11645 vFAIL2("Sequence %.2s... not terminated",parse_start);
11647 /* this pretty much dupes the code for (?P=...) in reg(), if
11648 you change this make sure you change that */
11649 char* name_start = (RExC_parse += 2);
11651 SV *sv_dat = reg_scan_name(pRExC_state,
11652 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
11653 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
11654 if (RExC_parse == name_start || *RExC_parse != ch)
11655 /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
11656 vFAIL2("Sequence %.3s... not terminated",parse_start);
11659 num = add_data( pRExC_state, STR_WITH_LEN("S"));
11660 RExC_rxi->data->data[num]=(void*)sv_dat;
11661 SvREFCNT_inc_simple_void(sv_dat);
11665 ret = reganode(pRExC_state,
11668 : (ASCII_FOLD_RESTRICTED)
11670 : (AT_LEAST_UNI_SEMANTICS)
11676 *flagp |= HASWIDTH;
11678 /* override incorrect value set in reganode MJD */
11679 Set_Node_Offset(ret, parse_start+1);
11680 Set_Node_Cur_Length(ret, parse_start);
11681 nextchar(pRExC_state);
11687 case '1': case '2': case '3': case '4':
11688 case '5': case '6': case '7': case '8': case '9':
11693 if (*RExC_parse == 'g') {
11697 if (*RExC_parse == '{') {
11701 if (*RExC_parse == '-') {
11705 if (hasbrace && !isDIGIT(*RExC_parse)) {
11706 if (isrel) RExC_parse--;
11708 goto parse_named_seq;
11711 num = S_backref_value(RExC_parse);
11713 vFAIL("Reference to invalid group 0");
11714 else if (num == I32_MAX) {
11715 if (isDIGIT(*RExC_parse))
11716 vFAIL("Reference to nonexistent group");
11718 vFAIL("Unterminated \\g... pattern");
11722 num = RExC_npar - num;
11724 vFAIL("Reference to nonexistent or unclosed group");
11728 num = S_backref_value(RExC_parse);
11729 /* bare \NNN might be backref or octal - if it is larger than or equal
11730 * RExC_npar then it is assumed to be and octal escape.
11731 * Note RExC_npar is +1 from the actual number of parens*/
11732 if (num == I32_MAX || (num > 9 && num >= RExC_npar
11733 && *RExC_parse != '8' && *RExC_parse != '9'))
11735 /* Probably a character specified in octal, e.g. \35 */
11740 /* at this point RExC_parse definitely points to a backref
11743 #ifdef RE_TRACK_PATTERN_OFFSETS
11744 char * const parse_start = RExC_parse - 1; /* MJD */
11746 while (isDIGIT(*RExC_parse))
11749 if (*RExC_parse != '}')
11750 vFAIL("Unterminated \\g{...} pattern");
11754 if (num > (I32)RExC_rx->nparens)
11755 vFAIL("Reference to nonexistent group");
11758 ret = reganode(pRExC_state,
11761 : (ASCII_FOLD_RESTRICTED)
11763 : (AT_LEAST_UNI_SEMANTICS)
11769 *flagp |= HASWIDTH;
11771 /* override incorrect value set in reganode MJD */
11772 Set_Node_Offset(ret, parse_start+1);
11773 Set_Node_Cur_Length(ret, parse_start);
11775 nextchar(pRExC_state);
11780 if (RExC_parse >= RExC_end)
11781 FAIL("Trailing \\");
11784 /* Do not generate "unrecognized" warnings here, we fall
11785 back into the quick-grab loop below */
11792 if (RExC_flags & RXf_PMf_EXTENDED) {
11793 RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
11794 if (RExC_parse < RExC_end)
11801 parse_start = RExC_parse - 1;
11810 #define MAX_NODE_STRING_SIZE 127
11811 char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
11813 U8 upper_parse = MAX_NODE_STRING_SIZE;
11814 U8 node_type = compute_EXACTish(pRExC_state);
11815 bool next_is_quantifier;
11816 char * oldp = NULL;
11818 /* We can convert EXACTF nodes to EXACTFU if they contain only
11819 * characters that match identically regardless of the target
11820 * string's UTF8ness. The reason to do this is that EXACTF is not
11821 * trie-able, EXACTFU is.
11823 * Similarly, we can convert EXACTFL nodes to EXACTFU if they
11824 * contain only above-Latin1 characters (hence must be in UTF8),
11825 * which don't participate in folds with Latin1-range characters,
11826 * as the latter's folds aren't known until runtime. (We don't
11827 * need to figure this out until pass 2) */
11828 bool maybe_exactfu = PASS2
11829 && (node_type == EXACTF || node_type == EXACTFL);
11831 /* If a folding node contains only code points that don't
11832 * participate in folds, it can be changed into an EXACT node,
11833 * which allows the optimizer more things to look for */
11836 ret = reg_node(pRExC_state, node_type);
11838 /* In pass1, folded, we use a temporary buffer instead of the
11839 * actual node, as the node doesn't exist yet */
11840 s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
11846 /* We do the EXACTFish to EXACT node only if folding. (And we
11847 * don't need to figure this out until pass 2) */
11848 maybe_exact = FOLD && PASS2;
11850 /* XXX The node can hold up to 255 bytes, yet this only goes to
11851 * 127. I (khw) do not know why. Keeping it somewhat less than
11852 * 255 allows us to not have to worry about overflow due to
11853 * converting to utf8 and fold expansion, but that value is
11854 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
11855 * split up by this limit into a single one using the real max of
11856 * 255. Even at 127, this breaks under rare circumstances. If
11857 * folding, we do not want to split a node at a character that is a
11858 * non-final in a multi-char fold, as an input string could just
11859 * happen to want to match across the node boundary. The join
11860 * would solve that problem if the join actually happens. But a
11861 * series of more than two nodes in a row each of 127 would cause
11862 * the first join to succeed to get to 254, but then there wouldn't
11863 * be room for the next one, which could at be one of those split
11864 * multi-char folds. I don't know of any fool-proof solution. One
11865 * could back off to end with only a code point that isn't such a
11866 * non-final, but it is possible for there not to be any in the
11868 for (p = RExC_parse - 1;
11869 len < upper_parse && p < RExC_end;
11874 if (RExC_flags & RXf_PMf_EXTENDED)
11875 p = regpatws(pRExC_state, p,
11876 TRUE); /* means recognize comments */
11887 /* Literal Escapes Switch
11889 This switch is meant to handle escape sequences that
11890 resolve to a literal character.
11892 Every escape sequence that represents something
11893 else, like an assertion or a char class, is handled
11894 in the switch marked 'Special Escapes' above in this
11895 routine, but also has an entry here as anything that
11896 isn't explicitly mentioned here will be treated as
11897 an unescaped equivalent literal.
11900 switch ((U8)*++p) {
11901 /* These are all the special escapes. */
11902 case 'A': /* Start assertion */
11903 case 'b': case 'B': /* Word-boundary assertion*/
11904 case 'C': /* Single char !DANGEROUS! */
11905 case 'd': case 'D': /* digit class */
11906 case 'g': case 'G': /* generic-backref, pos assertion */
11907 case 'h': case 'H': /* HORIZWS */
11908 case 'k': case 'K': /* named backref, keep marker */
11909 case 'p': case 'P': /* Unicode property */
11910 case 'R': /* LNBREAK */
11911 case 's': case 'S': /* space class */
11912 case 'v': case 'V': /* VERTWS */
11913 case 'w': case 'W': /* word class */
11914 case 'X': /* eXtended Unicode "combining
11915 character sequence" */
11916 case 'z': case 'Z': /* End of line/string assertion */
11920 /* Anything after here is an escape that resolves to a
11921 literal. (Except digits, which may or may not)
11927 case 'N': /* Handle a single-code point named character. */
11928 /* The options cause it to fail if a multiple code
11929 * point sequence. Handle those in the switch() above
11931 RExC_parse = p + 1;
11932 if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
11938 if (*flagp & RESTART_UTF8)
11939 FAIL("panic: grok_bslash_N set RESTART_UTF8");
11940 RExC_parse = p = oldp;
11944 if (ender > 0xff) {
11961 ender = ESC_NATIVE;
11971 const char* error_msg;
11973 bool valid = grok_bslash_o(&p,
11976 PASS2, /* out warnings */
11977 FALSE, /* not strict */
11978 TRUE, /* Output warnings
11983 RExC_parse = p; /* going to die anyway; point
11984 to exact spot of failure */
11988 if (PL_encoding && ender < 0x100) {
11989 goto recode_encoding;
11991 if (ender > 0xff) {
11998 UV result = UV_MAX; /* initialize to erroneous
12000 const char* error_msg;
12002 bool valid = grok_bslash_x(&p,
12005 PASS2, /* out warnings */
12006 FALSE, /* not strict */
12007 TRUE, /* Output warnings
12012 RExC_parse = p; /* going to die anyway; point
12013 to exact spot of failure */
12018 if (PL_encoding && ender < 0x100) {
12019 goto recode_encoding;
12021 if (ender > 0xff) {
12028 ender = grok_bslash_c(*p++, PASS2);
12030 case '8': case '9': /* must be a backreference */
12033 case '1': case '2': case '3':case '4':
12034 case '5': case '6': case '7':
12035 /* When we parse backslash escapes there is ambiguity
12036 * between backreferences and octal escapes. Any escape
12037 * from \1 - \9 is a backreference, any multi-digit
12038 * escape which does not start with 0 and which when
12039 * evaluated as decimal could refer to an already
12040 * parsed capture buffer is a backslash. Anything else
12043 * Note this implies that \118 could be interpreted as
12044 * 118 OR as "\11" . "8" depending on whether there
12045 * were 118 capture buffers defined already in the
12048 /* NOTE, RExC_npar is 1 more than the actual number of
12049 * parens we have seen so far, hence the < RExC_npar below. */
12051 if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
12052 { /* Not to be treated as an octal constant, go
12060 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
12062 ender = grok_oct(p, &numlen, &flags, NULL);
12063 if (ender > 0xff) {
12067 if (PASS2 /* like \08, \178 */
12070 && isDIGIT(*p) && ckWARN(WARN_REGEXP))
12072 reg_warn_non_literal_string(
12074 form_short_octal_warning(p, numlen));
12077 if (PL_encoding && ender < 0x100)
12078 goto recode_encoding;
12081 if (! RExC_override_recoding) {
12082 SV* enc = PL_encoding;
12083 ender = reg_recode((const char)(U8)ender, &enc);
12085 ckWARNreg(p, "Invalid escape in the specified encoding");
12091 FAIL("Trailing \\");
12094 if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
12095 /* Include any { following the alpha to emphasize
12096 * that it could be part of an escape at some point
12098 int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
12099 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
12101 goto normal_default;
12102 } /* End of switch on '\' */
12105 /* Currently we don't warn when the lbrace is at the start
12106 * of a construct. This catches it in the middle of a
12107 * literal string, or when its the first thing after
12108 * something like "\b" */
12110 && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
12112 ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
12115 default: /* A literal character */
12117 if (UTF8_IS_START(*p) && UTF) {
12119 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
12120 &numlen, UTF8_ALLOW_DEFAULT);
12126 } /* End of switch on the literal */
12128 /* Here, have looked at the literal character and <ender>
12129 * contains its ordinal, <p> points to the character after it
12132 if ( RExC_flags & RXf_PMf_EXTENDED)
12133 p = regpatws(pRExC_state, p,
12134 TRUE); /* means recognize comments */
12136 /* If the next thing is a quantifier, it applies to this
12137 * character only, which means that this character has to be in
12138 * its own node and can't just be appended to the string in an
12139 * existing node, so if there are already other characters in
12140 * the node, close the node with just them, and set up to do
12141 * this character again next time through, when it will be the
12142 * only thing in its new node */
12143 if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
12149 if (! FOLD /* The simple case, just append the literal */
12150 || (LOC /* Also don't fold for tricky chars under /l */
12151 && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
12154 const STRLEN unilen = reguni(pRExC_state, ender, s);
12160 /* The loop increments <len> each time, as all but this
12161 * path (and one other) through it add a single byte to
12162 * the EXACTish node. But this one has changed len to
12163 * be the correct final value, so subtract one to
12164 * cancel out the increment that follows */
12168 REGC((char)ender, s++);
12171 /* Can get here if folding only if is one of the /l
12172 * characters whose fold depends on the locale. The
12173 * occurrence of any of these indicate that we can't
12174 * simplify things */
12176 maybe_exact = FALSE;
12177 maybe_exactfu = FALSE;
12182 /* See comments for join_exact() as to why we fold this
12183 * non-UTF at compile time */
12184 || (node_type == EXACTFU
12185 && ender == LATIN_SMALL_LETTER_SHARP_S)))
12187 /* Here, are folding and are not UTF-8 encoded; therefore
12188 * the character must be in the range 0-255, and is not /l
12189 * (Not /l because we already handled these under /l in
12190 * is_PROBLEMATIC_LOCALE_FOLD_cp */
12191 if (IS_IN_SOME_FOLD_L1(ender)) {
12192 maybe_exact = FALSE;
12194 /* See if the character's fold differs between /d and
12195 * /u. This includes the multi-char fold SHARP S to
12198 && (PL_fold[ender] != PL_fold_latin1[ender]
12199 || ender == LATIN_SMALL_LETTER_SHARP_S
12201 && isALPHA_FOLD_EQ(ender, 's')
12202 && isALPHA_FOLD_EQ(*(s-1), 's'))))
12204 maybe_exactfu = FALSE;
12208 /* Even when folding, we store just the input character, as
12209 * we have an array that finds its fold quickly */
12210 *(s++) = (char) ender;
12212 else { /* FOLD and UTF */
12213 /* Unlike the non-fold case, we do actually have to
12214 * calculate the results here in pass 1. This is for two
12215 * reasons, the folded length may be longer than the
12216 * unfolded, and we have to calculate how many EXACTish
12217 * nodes it will take; and we may run out of room in a node
12218 * in the middle of a potential multi-char fold, and have
12219 * to back off accordingly. (Hence we can't use REGC for
12220 * the simple case just below.) */
12223 if (isASCII(ender)) {
12224 folded = toFOLD(ender);
12225 *(s)++ = (U8) folded;
12230 folded = _to_uni_fold_flags(
12234 FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
12235 ? FOLD_FLAGS_NOMIX_ASCII
12239 /* The loop increments <len> each time, as all but this
12240 * path (and one other) through it add a single byte to
12241 * the EXACTish node. But this one has changed len to
12242 * be the correct final value, so subtract one to
12243 * cancel out the increment that follows */
12244 len += foldlen - 1;
12246 /* If this node only contains non-folding code points so
12247 * far, see if this new one is also non-folding */
12249 if (folded != ender) {
12250 maybe_exact = FALSE;
12253 /* Here the fold is the original; we have to check
12254 * further to see if anything folds to it */
12255 if (_invlist_contains_cp(PL_utf8_foldable,
12258 maybe_exact = FALSE;
12265 if (next_is_quantifier) {
12267 /* Here, the next input is a quantifier, and to get here,
12268 * the current character is the only one in the node.
12269 * Also, here <len> doesn't include the final byte for this
12275 } /* End of loop through literal characters */
12277 /* Here we have either exhausted the input or ran out of room in
12278 * the node. (If we encountered a character that can't be in the
12279 * node, transfer is made directly to <loopdone>, and so we
12280 * wouldn't have fallen off the end of the loop.) In the latter
12281 * case, we artificially have to split the node into two, because
12282 * we just don't have enough space to hold everything. This
12283 * creates a problem if the final character participates in a
12284 * multi-character fold in the non-final position, as a match that
12285 * should have occurred won't, due to the way nodes are matched,
12286 * and our artificial boundary. So back off until we find a non-
12287 * problematic character -- one that isn't at the beginning or
12288 * middle of such a fold. (Either it doesn't participate in any
12289 * folds, or appears only in the final position of all the folds it
12290 * does participate in.) A better solution with far fewer false
12291 * positives, and that would fill the nodes more completely, would
12292 * be to actually have available all the multi-character folds to
12293 * test against, and to back-off only far enough to be sure that
12294 * this node isn't ending with a partial one. <upper_parse> is set
12295 * further below (if we need to reparse the node) to include just
12296 * up through that final non-problematic character that this code
12297 * identifies, so when it is set to less than the full node, we can
12298 * skip the rest of this */
12299 if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
12301 const STRLEN full_len = len;
12303 assert(len >= MAX_NODE_STRING_SIZE);
12305 /* Here, <s> points to the final byte of the final character.
12306 * Look backwards through the string until find a non-
12307 * problematic character */
12311 /* This has no multi-char folds to non-UTF characters */
12312 if (ASCII_FOLD_RESTRICTED) {
12316 while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
12320 if (! PL_NonL1NonFinalFold) {
12321 PL_NonL1NonFinalFold = _new_invlist_C_array(
12322 NonL1_Perl_Non_Final_Folds_invlist);
12325 /* Point to the first byte of the final character */
12326 s = (char *) utf8_hop((U8 *) s, -1);
12328 while (s >= s0) { /* Search backwards until find
12329 non-problematic char */
12330 if (UTF8_IS_INVARIANT(*s)) {
12332 /* There are no ascii characters that participate
12333 * in multi-char folds under /aa. In EBCDIC, the
12334 * non-ascii invariants are all control characters,
12335 * so don't ever participate in any folds. */
12336 if (ASCII_FOLD_RESTRICTED
12337 || ! IS_NON_FINAL_FOLD(*s))
12342 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
12343 if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
12349 else if (! _invlist_contains_cp(
12350 PL_NonL1NonFinalFold,
12351 valid_utf8_to_uvchr((U8 *) s, NULL)))
12356 /* Here, the current character is problematic in that
12357 * it does occur in the non-final position of some
12358 * fold, so try the character before it, but have to
12359 * special case the very first byte in the string, so
12360 * we don't read outside the string */
12361 s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
12362 } /* End of loop backwards through the string */
12364 /* If there were only problematic characters in the string,
12365 * <s> will point to before s0, in which case the length
12366 * should be 0, otherwise include the length of the
12367 * non-problematic character just found */
12368 len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
12371 /* Here, have found the final character, if any, that is
12372 * non-problematic as far as ending the node without splitting
12373 * it across a potential multi-char fold. <len> contains the
12374 * number of bytes in the node up-to and including that
12375 * character, or is 0 if there is no such character, meaning
12376 * the whole node contains only problematic characters. In
12377 * this case, give up and just take the node as-is. We can't
12382 /* If the node ends in an 's' we make sure it stays EXACTF,
12383 * as if it turns into an EXACTFU, it could later get
12384 * joined with another 's' that would then wrongly match
12386 if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
12388 maybe_exactfu = FALSE;
12392 /* Here, the node does contain some characters that aren't
12393 * problematic. If one such is the final character in the
12394 * node, we are done */
12395 if (len == full_len) {
12398 else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
12400 /* If the final character is problematic, but the
12401 * penultimate is not, back-off that last character to
12402 * later start a new node with it */
12407 /* Here, the final non-problematic character is earlier
12408 * in the input than the penultimate character. What we do
12409 * is reparse from the beginning, going up only as far as
12410 * this final ok one, thus guaranteeing that the node ends
12411 * in an acceptable character. The reason we reparse is
12412 * that we know how far in the character is, but we don't
12413 * know how to correlate its position with the input parse.
12414 * An alternate implementation would be to build that
12415 * correlation as we go along during the original parse,
12416 * but that would entail extra work for every node, whereas
12417 * this code gets executed only when the string is too
12418 * large for the node, and the final two characters are
12419 * problematic, an infrequent occurrence. Yet another
12420 * possible strategy would be to save the tail of the
12421 * string, and the next time regatom is called, initialize
12422 * with that. The problem with this is that unless you
12423 * back off one more character, you won't be guaranteed
12424 * regatom will get called again, unless regbranch,
12425 * regpiece ... are also changed. If you do back off that
12426 * extra character, so that there is input guaranteed to
12427 * force calling regatom, you can't handle the case where
12428 * just the first character in the node is acceptable. I
12429 * (khw) decided to try this method which doesn't have that
12430 * pitfall; if performance issues are found, we can do a
12431 * combination of the current approach plus that one */
12437 } /* End of verifying node ends with an appropriate char */
12439 loopdone: /* Jumped to when encounters something that shouldn't be in
12442 /* I (khw) don't know if you can get here with zero length, but the
12443 * old code handled this situation by creating a zero-length EXACT
12444 * node. Might as well be NOTHING instead */
12450 /* If 'maybe_exact' is still set here, means there are no
12451 * code points in the node that participate in folds;
12452 * similarly for 'maybe_exactfu' and code points that match
12453 * differently depending on UTF8ness of the target string
12454 * (for /u), or depending on locale for /l */
12458 else if (maybe_exactfu) {
12462 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
12463 FALSE /* Don't look to see if could
12464 be turned into an EXACT
12465 node, as we have already
12470 RExC_parse = p - 1;
12471 Set_Node_Cur_Length(ret, parse_start);
12472 nextchar(pRExC_state);
12474 /* len is STRLEN which is unsigned, need to copy to signed */
12477 vFAIL("Internal disaster");
12480 } /* End of label 'defchar:' */
12482 } /* End of giant switch on input character */
12488 S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
12490 /* Returns the next non-pattern-white space, non-comment character (the
12491 * latter only if 'recognize_comment is true) in the string p, which is
12492 * ended by RExC_end. See also reg_skipcomment */
12493 const char *e = RExC_end;
12495 PERL_ARGS_ASSERT_REGPATWS;
12499 if ((len = is_PATWS_safe(p, e, UTF))) {
12502 else if (recognize_comment && *p == '#') {
12503 p = reg_skipcomment(pRExC_state, p);
12512 S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
12514 /* Uses the inversion list '*invlist_ptr' to populate the ANYOF 'node'. It
12515 * sets up the bitmap and any flags, removing those code points from the
12516 * inversion list, setting it to NULL should it become completely empty */
12518 PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST;
12519 assert(PL_regkind[OP(node)] == ANYOF);
12521 ANYOF_BITMAP_ZERO(node);
12522 if (*invlist_ptr) {
12524 /* This gets set if we actually need to modify things */
12525 bool change_invlist = FALSE;
12529 /* Start looking through *invlist_ptr */
12530 invlist_iterinit(*invlist_ptr);
12531 while (invlist_iternext(*invlist_ptr, &start, &end)) {
12535 if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
12536 ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
12538 else if (end >= NUM_ANYOF_CODE_POINTS) {
12539 ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
12542 /* Quit if are above what we should change */
12543 if (start >= NUM_ANYOF_CODE_POINTS) {
12547 change_invlist = TRUE;
12549 /* Set all the bits in the range, up to the max that we are doing */
12550 high = (end < NUM_ANYOF_CODE_POINTS - 1)
12552 : NUM_ANYOF_CODE_POINTS - 1;
12553 for (i = start; i <= (int) high; i++) {
12554 if (! ANYOF_BITMAP_TEST(node, i)) {
12555 ANYOF_BITMAP_SET(node, i);
12559 invlist_iterfinish(*invlist_ptr);
12561 /* Done with loop; remove any code points that are in the bitmap from
12562 * *invlist_ptr; similarly for code points above the bitmap if we have
12563 * a flag to match all of them anyways */
12564 if (change_invlist) {
12565 _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr);
12567 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
12568 _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr);
12571 /* If have completely emptied it, remove it completely */
12572 if (_invlist_len(*invlist_ptr) == 0) {
12573 SvREFCNT_dec_NN(*invlist_ptr);
12574 *invlist_ptr = NULL;
12579 /* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
12580 Character classes ([:foo:]) can also be negated ([:^foo:]).
12581 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
12582 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
12583 but trigger failures because they are currently unimplemented. */
12585 #define POSIXCC_DONE(c) ((c) == ':')
12586 #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
12587 #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
12589 PERL_STATIC_INLINE I32
12590 S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
12592 I32 namedclass = OOB_NAMEDCLASS;
12594 PERL_ARGS_ASSERT_REGPPOSIXCC;
12596 if (value == '[' && RExC_parse + 1 < RExC_end &&
12597 /* I smell either [: or [= or [. -- POSIX has been here, right? */
12598 POSIXCC(UCHARAT(RExC_parse)))
12600 const char c = UCHARAT(RExC_parse);
12601 char* const s = RExC_parse++;
12603 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
12605 if (RExC_parse == RExC_end) {
12608 /* Try to give a better location for the error (than the end of
12609 * the string) by looking for the matching ']' */
12611 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
12614 vFAIL2("Unmatched '%c' in POSIX class", c);
12616 /* Grandfather lone [:, [=, [. */
12620 const char* const t = RExC_parse++; /* skip over the c */
12623 if (UCHARAT(RExC_parse) == ']') {
12624 const char *posixcc = s + 1;
12625 RExC_parse++; /* skip over the ending ] */
12628 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
12629 const I32 skip = t - posixcc;
12631 /* Initially switch on the length of the name. */
12634 if (memEQ(posixcc, "word", 4)) /* this is not POSIX,
12635 this is the Perl \w
12637 namedclass = ANYOF_WORDCHAR;
12640 /* Names all of length 5. */
12641 /* alnum alpha ascii blank cntrl digit graph lower
12642 print punct space upper */
12643 /* Offset 4 gives the best switch position. */
12644 switch (posixcc[4]) {
12646 if (memEQ(posixcc, "alph", 4)) /* alpha */
12647 namedclass = ANYOF_ALPHA;
12650 if (memEQ(posixcc, "spac", 4)) /* space */
12651 namedclass = ANYOF_PSXSPC;
12654 if (memEQ(posixcc, "grap", 4)) /* graph */
12655 namedclass = ANYOF_GRAPH;
12658 if (memEQ(posixcc, "asci", 4)) /* ascii */
12659 namedclass = ANYOF_ASCII;
12662 if (memEQ(posixcc, "blan", 4)) /* blank */
12663 namedclass = ANYOF_BLANK;
12666 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
12667 namedclass = ANYOF_CNTRL;
12670 if (memEQ(posixcc, "alnu", 4)) /* alnum */
12671 namedclass = ANYOF_ALPHANUMERIC;
12674 if (memEQ(posixcc, "lowe", 4)) /* lower */
12675 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
12676 else if (memEQ(posixcc, "uppe", 4)) /* upper */
12677 namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
12680 if (memEQ(posixcc, "digi", 4)) /* digit */
12681 namedclass = ANYOF_DIGIT;
12682 else if (memEQ(posixcc, "prin", 4)) /* print */
12683 namedclass = ANYOF_PRINT;
12684 else if (memEQ(posixcc, "punc", 4)) /* punct */
12685 namedclass = ANYOF_PUNCT;
12690 if (memEQ(posixcc, "xdigit", 6))
12691 namedclass = ANYOF_XDIGIT;
12695 if (namedclass == OOB_NAMEDCLASS)
12697 "POSIX class [:%"UTF8f":] unknown",
12698 UTF8fARG(UTF, t - s - 1, s + 1));
12700 /* The #defines are structured so each complement is +1 to
12701 * the normal one */
12705 assert (posixcc[skip] == ':');
12706 assert (posixcc[skip+1] == ']');
12707 } else if (!SIZE_ONLY) {
12708 /* [[=foo=]] and [[.foo.]] are still future. */
12710 /* adjust RExC_parse so the warning shows after
12711 the class closes */
12712 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
12714 vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
12717 /* Maternal grandfather:
12718 * "[:" ending in ":" but not in ":]" */
12720 vFAIL("Unmatched '[' in POSIX class");
12723 /* Grandfather lone [:, [=, [. */
12733 S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
12735 /* This applies some heuristics at the current parse position (which should
12736 * be at a '[') to see if what follows might be intended to be a [:posix:]
12737 * class. It returns true if it really is a posix class, of course, but it
12738 * also can return true if it thinks that what was intended was a posix
12739 * class that didn't quite make it.
12741 * It will return true for
12743 * [:alphanumerics] (as long as the ] isn't followed immediately by a
12744 * ')' indicating the end of the (?[
12745 * [:any garbage including %^&$ punctuation:]
12747 * This is designed to be called only from S_handle_regex_sets; it could be
12748 * easily adapted to be called from the spot at the beginning of regclass()
12749 * that checks to see in a normal bracketed class if the surrounding []
12750 * have been omitted ([:word:] instead of [[:word:]]). But doing so would
12751 * change long-standing behavior, so I (khw) didn't do that */
12752 char* p = RExC_parse + 1;
12753 char first_char = *p;
12755 PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS;
12757 assert(*(p - 1) == '[');
12759 if (! POSIXCC(first_char)) {
12764 while (p < RExC_end && isWORDCHAR(*p)) p++;
12766 if (p >= RExC_end) {
12770 if (p - RExC_parse > 2 /* Got at least 1 word character */
12771 && (*p == first_char
12772 || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')')))
12777 p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse);
12780 && p - RExC_parse > 2 /* [:] evaluates to colon;
12781 [::] is a bad posix class. */
12782 && first_char == *(p - 1));
12786 S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
12787 I32 *flagp, U32 depth,
12788 char * const oregcomp_parse)
12790 /* Handle the (?[...]) construct to do set operations */
12793 UV start, end; /* End points of code point ranges */
12795 char *save_end, *save_parse;
12800 const bool save_fold = FOLD;
12802 GET_RE_DEBUG_FLAGS_DECL;
12804 PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
12807 vFAIL("(?[...]) not valid in locale");
12809 RExC_uni_semantics = 1;
12811 /* This will return only an ANYOF regnode, or (unlikely) something smaller
12812 * (such as EXACT). Thus we can skip most everything if just sizing. We
12813 * call regclass to handle '[]' so as to not have to reinvent its parsing
12814 * rules here (throwing away the size it computes each time). And, we exit
12815 * upon an unescaped ']' that isn't one ending a regclass. To do both
12816 * these things, we need to realize that something preceded by a backslash
12817 * is escaped, so we have to keep track of backslashes */
12819 Perl_ck_warner_d(aTHX_
12820 packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
12821 "The regex_sets feature is experimental" REPORT_LOCATION,
12822 UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
12824 RExC_end - RExC_start - (RExC_parse - RExC_precomp),
12825 RExC_precomp + (RExC_parse - RExC_precomp)));
12828 UV depth = 0; /* how many nested (?[...]) constructs */
12830 while (RExC_parse < RExC_end) {
12831 SV* current = NULL;
12832 RExC_parse = regpatws(pRExC_state, RExC_parse,
12833 TRUE); /* means recognize comments */
12834 switch (*RExC_parse) {
12836 if (RExC_parse[1] == '[') depth++, RExC_parse++;
12841 /* Skip the next byte (which could cause us to end up in
12842 * the middle of a UTF-8 character, but since none of those
12843 * are confusable with anything we currently handle in this
12844 * switch (invariants all), it's safe. We'll just hit the
12845 * default: case next time and keep on incrementing until
12846 * we find one of the invariants we do handle. */
12851 /* If this looks like it is a [:posix:] class, leave the
12852 * parse pointer at the '[' to fool regclass() into
12853 * thinking it is part of a '[[:posix:]]'. That function
12854 * will use strict checking to force a syntax error if it
12855 * doesn't work out to a legitimate class */
12856 bool is_posix_class
12857 = could_it_be_a_POSIX_class(pRExC_state);
12858 if (! is_posix_class) {
12862 /* regclass() can only return RESTART_UTF8 if multi-char
12863 folds are allowed. */
12864 if (!regclass(pRExC_state, flagp,depth+1,
12865 is_posix_class, /* parse the whole char
12866 class only if not a
12868 FALSE, /* don't allow multi-char folds */
12869 TRUE, /* silence non-portable warnings. */
12871 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
12874 /* function call leaves parse pointing to the ']', except
12875 * if we faked it */
12876 if (is_posix_class) {
12880 SvREFCNT_dec(current); /* In case it returned something */
12885 if (depth--) break;
12887 if (RExC_parse < RExC_end
12888 && *RExC_parse == ')')
12890 node = reganode(pRExC_state, ANYOF, 0);
12891 RExC_size += ANYOF_SKIP;
12892 nextchar(pRExC_state);
12893 Set_Node_Length(node,
12894 RExC_parse - oregcomp_parse + 1); /* MJD */
12903 FAIL("Syntax error in (?[...])");
12906 /* Pass 2 only after this. Everything in this construct is a
12907 * metacharacter. Operands begin with either a '\' (for an escape
12908 * sequence), or a '[' for a bracketed character class. Any other
12909 * character should be an operator, or parenthesis for grouping. Both
12910 * types of operands are handled by calling regclass() to parse them. It
12911 * is called with a parameter to indicate to return the computed inversion
12912 * list. The parsing here is implemented via a stack. Each entry on the
12913 * stack is a single character representing one of the operators, or the
12914 * '('; or else a pointer to an operand inversion list. */
12916 #define IS_OPERAND(a) (! SvIOK(a))
12918 /* The stack starts empty. It is a syntax error if the first thing parsed
12919 * is a binary operator; everything else is pushed on the stack. When an
12920 * operand is parsed, the top of the stack is examined. If it is a binary
12921 * operator, the item before it should be an operand, and both are replaced
12922 * by the result of doing that operation on the new operand and the one on
12923 * the stack. Thus a sequence of binary operands is reduced to a single
12924 * one before the next one is parsed.
12926 * A unary operator may immediately follow a binary in the input, for
12929 * When an operand is parsed and the top of the stack is a unary operator,
12930 * the operation is performed, and then the stack is rechecked to see if
12931 * this new operand is part of a binary operation; if so, it is handled as
12934 * A '(' is simply pushed on the stack; it is valid only if the stack is
12935 * empty, or the top element of the stack is an operator or another '('
12936 * (for which the parenthesized expression will become an operand). By the
12937 * time the corresponding ')' is parsed everything in between should have
12938 * been parsed and evaluated to a single operand (or else is a syntax
12939 * error), and is handled as a regular operand */
12941 sv_2mortal((SV *)(stack = newAV()));
12943 while (RExC_parse < RExC_end) {
12944 I32 top_index = av_tindex(stack);
12946 SV* current = NULL;
12948 /* Skip white space */
12949 RExC_parse = regpatws(pRExC_state, RExC_parse,
12950 TRUE /* means recognize comments */ );
12951 if (RExC_parse >= RExC_end) {
12952 Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
12954 if ((curchar = UCHARAT(RExC_parse)) == ']') {
12961 if (av_tindex(stack) >= 0 /* This makes sure that we can
12962 safely subtract 1 from
12963 RExC_parse in the next clause.
12964 If we have something on the
12965 stack, we have parsed something
12967 && UCHARAT(RExC_parse - 1) == '('
12968 && RExC_parse < RExC_end)
12970 /* If is a '(?', could be an embedded '(?flags:(?[...])'.
12971 * This happens when we have some thing like
12973 * my $thai_or_lao = qr/(?[ \p{Thai} + \p{Lao} ])/;
12975 * qr/(?[ \p{Digit} & $thai_or_lao ])/;
12977 * Here we would be handling the interpolated
12978 * '$thai_or_lao'. We handle this by a recursive call to
12979 * ourselves which returns the inversion list the
12980 * interpolated expression evaluates to. We use the flags
12981 * from the interpolated pattern. */
12982 U32 save_flags = RExC_flags;
12983 const char * const save_parse = ++RExC_parse;
12985 parse_lparen_question_flags(pRExC_state);
12987 if (RExC_parse == save_parse /* Makes sure there was at
12988 least one flag (or this
12989 embedding wasn't compiled)
12991 || RExC_parse >= RExC_end - 4
12992 || UCHARAT(RExC_parse) != ':'
12993 || UCHARAT(++RExC_parse) != '('
12994 || UCHARAT(++RExC_parse) != '?'
12995 || UCHARAT(++RExC_parse) != '[')
12998 /* In combination with the above, this moves the
12999 * pointer to the point just after the first erroneous
13000 * character (or if there are no flags, to where they
13001 * should have been) */
13002 if (RExC_parse >= RExC_end - 4) {
13003 RExC_parse = RExC_end;
13005 else if (RExC_parse != save_parse) {
13006 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13008 vFAIL("Expecting '(?flags:(?[...'");
13011 (void) handle_regex_sets(pRExC_state, ¤t, flagp,
13012 depth+1, oregcomp_parse);
13014 /* Here, 'current' contains the embedded expression's
13015 * inversion list, and RExC_parse points to the trailing
13016 * ']'; the next character should be the ')' which will be
13017 * paired with the '(' that has been put on the stack, so
13018 * the whole embedded expression reduces to '(operand)' */
13021 RExC_flags = save_flags;
13022 goto handle_operand;
13027 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13028 vFAIL("Unexpected character");
13031 /* regclass() can only return RESTART_UTF8 if multi-char
13032 folds are allowed. */
13033 if (!regclass(pRExC_state, flagp,depth+1,
13034 TRUE, /* means parse just the next thing */
13035 FALSE, /* don't allow multi-char folds */
13036 FALSE, /* don't silence non-portable warnings. */
13038 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13040 /* regclass() will return with parsing just the \ sequence,
13041 * leaving the parse pointer at the next thing to parse */
13043 goto handle_operand;
13045 case '[': /* Is a bracketed character class */
13047 bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state);
13049 if (! is_posix_class) {
13053 /* regclass() can only return RESTART_UTF8 if multi-char
13054 folds are allowed. */
13055 if(!regclass(pRExC_state, flagp,depth+1,
13056 is_posix_class, /* parse the whole char class
13057 only if not a posix class */
13058 FALSE, /* don't allow multi-char folds */
13059 FALSE, /* don't silence non-portable warnings. */
13061 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
13063 /* function call leaves parse pointing to the ']', except if we
13065 if (is_posix_class) {
13069 goto handle_operand;
13078 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
13079 || ! IS_OPERAND(*top_ptr))
13082 vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
13084 av_push(stack, newSVuv(curchar));
13088 av_push(stack, newSVuv(curchar));
13092 if (top_index >= 0) {
13093 top_ptr = av_fetch(stack, top_index, FALSE);
13095 if (IS_OPERAND(*top_ptr)) {
13097 vFAIL("Unexpected '(' with no preceding operator");
13100 av_push(stack, newSVuv(curchar));
13107 || ! (current = av_pop(stack))
13108 || ! IS_OPERAND(current)
13109 || ! (lparen = av_pop(stack))
13110 || IS_OPERAND(lparen)
13111 || SvUV(lparen) != '(')
13113 SvREFCNT_dec(current);
13115 vFAIL("Unexpected ')'");
13118 SvREFCNT_dec_NN(lparen);
13125 /* Here, we have an operand to process, in 'current' */
13127 if (top_index < 0) { /* Just push if stack is empty */
13128 av_push(stack, current);
13131 SV* top = av_pop(stack);
13133 char current_operator;
13135 if (IS_OPERAND(top)) {
13136 SvREFCNT_dec_NN(top);
13137 SvREFCNT_dec_NN(current);
13138 vFAIL("Operand with no preceding operator");
13140 current_operator = (char) SvUV(top);
13141 switch (current_operator) {
13142 case '(': /* Push the '(' back on followed by the new
13144 av_push(stack, top);
13145 av_push(stack, current);
13146 SvREFCNT_inc(top); /* Counters the '_dec' done
13147 just after the 'break', so
13148 it doesn't get wrongly freed
13153 _invlist_invert(current);
13155 /* Unlike binary operators, the top of the stack,
13156 * now that this unary one has been popped off, may
13157 * legally be an operator, and we now have operand
13160 SvREFCNT_dec_NN(top);
13161 goto handle_operand;
13164 prev = av_pop(stack);
13165 _invlist_intersection(prev,
13168 av_push(stack, current);
13173 prev = av_pop(stack);
13174 _invlist_union(prev, current, ¤t);
13175 av_push(stack, current);
13179 prev = av_pop(stack);;
13180 _invlist_subtract(prev, current, ¤t);
13181 av_push(stack, current);
13184 case '^': /* The union minus the intersection */
13190 prev = av_pop(stack);
13191 _invlist_union(prev, current, &u);
13192 _invlist_intersection(prev, current, &i);
13193 /* _invlist_subtract will overwrite current
13194 without freeing what it already contains */
13196 _invlist_subtract(u, i, ¤t);
13197 av_push(stack, current);
13198 SvREFCNT_dec_NN(i);
13199 SvREFCNT_dec_NN(u);
13200 SvREFCNT_dec_NN(element);
13205 Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
13207 SvREFCNT_dec_NN(top);
13208 SvREFCNT_dec(prev);
13212 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13215 if (av_tindex(stack) < 0 /* Was empty */
13216 || ((final = av_pop(stack)) == NULL)
13217 || ! IS_OPERAND(final)
13218 || av_tindex(stack) >= 0) /* More left on stack */
13220 vFAIL("Incomplete expression within '(?[ ])'");
13223 /* Here, 'final' is the resultant inversion list from evaluating the
13224 * expression. Return it if so requested */
13225 if (return_invlist) {
13226 *return_invlist = final;
13230 /* Otherwise generate a resultant node, based on 'final'. regclass() is
13231 * expecting a string of ranges and individual code points */
13232 invlist_iterinit(final);
13233 result_string = newSVpvs("");
13234 while (invlist_iternext(final, &start, &end)) {
13235 if (start == end) {
13236 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start);
13239 Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}",
13244 save_parse = RExC_parse;
13245 RExC_parse = SvPV(result_string, len);
13246 save_end = RExC_end;
13247 RExC_end = RExC_parse + len;
13249 /* We turn off folding around the call, as the class we have constructed
13250 * already has all folding taken into consideration, and we don't want
13251 * regclass() to add to that */
13252 RExC_flags &= ~RXf_PMf_FOLD;
13253 /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
13255 node = regclass(pRExC_state, flagp,depth+1,
13256 FALSE, /* means parse the whole char class */
13257 FALSE, /* don't allow multi-char folds */
13258 TRUE, /* silence non-portable warnings. The above may very
13259 well have generated non-portable code points, but
13260 they're valid on this machine */
13263 FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
13266 RExC_flags |= RXf_PMf_FOLD;
13268 RExC_parse = save_parse + 1;
13269 RExC_end = save_end;
13270 SvREFCNT_dec_NN(final);
13271 SvREFCNT_dec_NN(result_string);
13273 nextchar(pRExC_state);
13274 Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
13280 S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
13282 /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
13283 * innocent-looking character class, like /[ks]/i won't have to go out to
13284 * disk to find the possible matches.
13286 * This should be called only for a Latin1-range code points, cp, which is
13287 * known to be involved in a simple fold with other code points above
13288 * Latin1. It would give false results if /aa has been specified.
13289 * Multi-char folds are outside the scope of this, and must be handled
13292 * XXX It would be better to generate these via regen, in case a new
13293 * version of the Unicode standard adds new mappings, though that is not
13294 * really likely, and may be caught by the default: case of the switch
13297 PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
13299 assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
13305 add_cp_to_invlist(*invlist, KELVIN_SIGN);
13309 *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
13312 *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
13313 *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
13315 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
13316 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
13317 *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
13319 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
13320 *invlist = add_cp_to_invlist(*invlist,
13321 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
13323 case LATIN_SMALL_LETTER_SHARP_S:
13324 *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
13327 /* Use deprecated warning to increase the chances of this being
13330 ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
13337 S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
13339 /* This adds the string scalar <multi_string> to the array
13340 * <multi_char_matches>. <multi_string> is known to have exactly
13341 * <cp_count> code points in it. This is used when constructing a
13342 * bracketed character class and we find something that needs to match more
13343 * than a single character.
13345 * <multi_char_matches> is actually an array of arrays. Each top-level
13346 * element is an array that contains all the strings known so far that are
13347 * the same length. And that length (in number of code points) is the same
13348 * as the index of the top-level array. Hence, the [2] element is an
13349 * array, each element thereof is a string containing TWO code points;
13350 * while element [3] is for strings of THREE characters, and so on. Since
13351 * this is for multi-char strings there can never be a [0] nor [1] element.
13353 * When we rewrite the character class below, we will do so such that the
13354 * longest strings are written first, so that it prefers the longest
13355 * matching strings first. This is done even if it turns out that any
13356 * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
13357 * Christiansen has agreed that this is ok. This makes the test for the
13358 * ligature 'ffi' come before the test for 'ff', for example */
13361 AV** this_array_ptr;
13363 PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
13365 if (! multi_char_matches) {
13366 multi_char_matches = newAV();
13369 if (av_exists(multi_char_matches, cp_count)) {
13370 this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
13371 this_array = *this_array_ptr;
13374 this_array = newAV();
13375 av_store(multi_char_matches, cp_count,
13378 av_push(this_array, multi_string);
13380 return multi_char_matches;
13383 /* The names of properties whose definitions are not known at compile time are
13384 * stored in this SV, after a constant heading. So if the length has been
13385 * changed since initialization, then there is a run-time definition. */
13386 #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
13387 (SvCUR(listsv) != initial_listsv_len)
13390 S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
13391 const bool stop_at_1, /* Just parse the next thing, don't
13392 look for a full character class */
13393 bool allow_multi_folds,
13394 const bool silence_non_portable, /* Don't output warnings
13397 SV** ret_invlist) /* Return an inversion list, not a node */
13399 /* parse a bracketed class specification. Most of these will produce an
13400 * ANYOF node; but something like [a] will produce an EXACT node; [aA], an
13401 * EXACTFish node; [[:ascii:]], a POSIXA node; etc. It is more complex
13402 * under /i with multi-character folds: it will be rewritten following the
13403 * paradigm of this example, where the <multi-fold>s are characters which
13404 * fold to multiple character sequences:
13405 * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
13406 * gets effectively rewritten as:
13407 * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
13408 * reg() gets called (recursively) on the rewritten version, and this
13409 * function will return what it constructs. (Actually the <multi-fold>s
13410 * aren't physically removed from the [abcdefghi], it's just that they are
13411 * ignored in the recursion by means of a flag:
13412 * <RExC_in_multi_char_class>.)
13414 * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
13415 * characters, with the corresponding bit set if that character is in the
13416 * list. For characters above this, a range list or swash is used. There
13417 * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
13418 * determinable at compile time
13420 * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
13421 * to be restarted. This can only happen if ret_invlist is non-NULL.
13424 UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
13426 UV value = OOB_UNICODE, save_value = OOB_UNICODE;
13429 IV namedclass = OOB_NAMEDCLASS;
13430 char *rangebegin = NULL;
13431 bool need_class = 0;
13433 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
13434 than just initialized. */
13435 SV* properties = NULL; /* Code points that match \p{} \P{} */
13436 SV* posixes = NULL; /* Code points that match classes like [:word:],
13437 extended beyond the Latin1 range. These have to
13438 be kept separate from other code points for much
13439 of this function because their handling is
13440 different under /i, and for most classes under
13442 SV* nposixes = NULL; /* Similarly for [:^word:]. These are kept
13443 separate for a while from the non-complemented
13444 versions because of complications with /d
13446 UV element_count = 0; /* Number of distinct elements in the class.
13447 Optimizations may be possible if this is tiny */
13448 AV * multi_char_matches = NULL; /* Code points that fold to more than one
13449 character; used under /i */
13451 char * stop_ptr = RExC_end; /* where to stop parsing */
13452 const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
13454 const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
13456 /* Unicode properties are stored in a swash; this holds the current one
13457 * being parsed. If this swash is the only above-latin1 component of the
13458 * character class, an optimization is to pass it directly on to the
13459 * execution engine. Otherwise, it is set to NULL to indicate that there
13460 * are other things in the class that have to be dealt with at execution
13462 SV* swash = NULL; /* Code points that match \p{} \P{} */
13464 /* Set if a component of this character class is user-defined; just passed
13465 * on to the engine */
13466 bool has_user_defined_property = FALSE;
13468 /* inversion list of code points this node matches only when the target
13469 * string is in UTF-8. (Because is under /d) */
13470 SV* depends_list = NULL;
13472 /* Inversion list of code points this node matches regardless of things
13473 * like locale, folding, utf8ness of the target string */
13474 SV* cp_list = NULL;
13476 /* Like cp_list, but code points on this list need to be checked for things
13477 * that fold to/from them under /i */
13478 SV* cp_foldable_list = NULL;
13480 /* Like cp_list, but code points on this list are valid only when the
13481 * runtime locale is UTF-8 */
13482 SV* only_utf8_locale_list = NULL;
13485 /* In a range, counts how many 0-2 of the ends of it came from literals,
13486 * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
13487 UV literal_endpoint = 0;
13489 bool invert = FALSE; /* Is this class to be complemented */
13491 bool warn_super = ALWAYS_WARN_SUPER;
13493 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
13494 case we need to change the emitted regop to an EXACT. */
13495 const char * orig_parse = RExC_parse;
13496 const SSize_t orig_size = RExC_size;
13497 bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
13498 GET_RE_DEBUG_FLAGS_DECL;
13500 PERL_ARGS_ASSERT_REGCLASS;
13502 PERL_UNUSED_ARG(depth);
13505 DEBUG_PARSE("clas");
13507 /* Assume we are going to generate an ANYOF node. */
13508 ret = reganode(pRExC_state, ANYOF, 0);
13511 RExC_size += ANYOF_SKIP;
13512 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
13515 ANYOF_FLAGS(ret) = 0;
13517 RExC_emit += ANYOF_SKIP;
13518 listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
13519 initial_listsv_len = SvCUR(listsv);
13520 SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
13524 RExC_parse = regpatws(pRExC_state, RExC_parse,
13525 FALSE /* means don't recognize comments */ );
13528 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
13531 allow_multi_folds = FALSE;
13534 RExC_parse = regpatws(pRExC_state, RExC_parse,
13535 FALSE /* means don't recognize comments */ );
13539 /* Check that they didn't say [:posix:] instead of [[:posix:]] */
13540 if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) {
13541 const char *s = RExC_parse;
13542 const char c = *s++;
13544 while (isWORDCHAR(*s))
13546 if (*s && c == *s && s[1] == ']') {
13547 SAVEFREESV(RExC_rx_sv);
13549 "POSIX syntax [%c %c] belongs inside character classes",
13551 (void)ReREFCNT_inc(RExC_rx_sv);
13555 /* If the caller wants us to just parse a single element, accomplish this
13556 * by faking the loop ending condition */
13557 if (stop_at_1 && RExC_end > RExC_parse) {
13558 stop_ptr = RExC_parse + 1;
13561 /* allow 1st char to be ']' (allowing it to be '-' is dealt with later) */
13562 if (UCHARAT(RExC_parse) == ']')
13563 goto charclassloop;
13566 if (RExC_parse >= stop_ptr) {
13571 RExC_parse = regpatws(pRExC_state, RExC_parse,
13572 FALSE /* means don't recognize comments */ );
13575 if (UCHARAT(RExC_parse) == ']') {
13581 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
13582 save_value = value;
13583 save_prevvalue = prevvalue;
13586 rangebegin = RExC_parse;
13590 value = utf8n_to_uvchr((U8*)RExC_parse,
13591 RExC_end - RExC_parse,
13592 &numlen, UTF8_ALLOW_DEFAULT);
13593 RExC_parse += numlen;
13596 value = UCHARAT(RExC_parse++);
13599 && RExC_parse < RExC_end
13600 && POSIXCC(UCHARAT(RExC_parse)))
13602 namedclass = regpposixcc(pRExC_state, value, strict);
13604 else if (value != '\\') {
13606 literal_endpoint++;
13610 /* Is a backslash; get the code point of the char after it */
13611 if (UTF && ! UTF8_IS_INVARIANT(RExC_parse)) {
13612 value = utf8n_to_uvchr((U8*)RExC_parse,
13613 RExC_end - RExC_parse,
13614 &numlen, UTF8_ALLOW_DEFAULT);
13615 RExC_parse += numlen;
13618 value = UCHARAT(RExC_parse++);
13620 /* Some compilers cannot handle switching on 64-bit integer
13621 * values, therefore value cannot be an UV. Yes, this will
13622 * be a problem later if we want switch on Unicode.
13623 * A similar issue a little bit later when switching on
13624 * namedclass. --jhi */
13626 /* If the \ is escaping white space when white space is being
13627 * skipped, it means that that white space is wanted literally, and
13628 * is already in 'value'. Otherwise, need to translate the escape
13629 * into what it signifies. */
13630 if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
13632 case 'w': namedclass = ANYOF_WORDCHAR; break;
13633 case 'W': namedclass = ANYOF_NWORDCHAR; break;
13634 case 's': namedclass = ANYOF_SPACE; break;
13635 case 'S': namedclass = ANYOF_NSPACE; break;
13636 case 'd': namedclass = ANYOF_DIGIT; break;
13637 case 'D': namedclass = ANYOF_NDIGIT; break;
13638 case 'v': namedclass = ANYOF_VERTWS; break;
13639 case 'V': namedclass = ANYOF_NVERTWS; break;
13640 case 'h': namedclass = ANYOF_HORIZWS; break;
13641 case 'H': namedclass = ANYOF_NHORIZWS; break;
13642 case 'N': /* Handle \N{NAME} in class */
13645 STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
13646 flagp, depth, &as_text);
13647 if (*flagp & RESTART_UTF8)
13648 FAIL("panic: grok_bslash_N set RESTART_UTF8");
13649 if (cp_count != 1) { /* The typical case drops through */
13650 assert(cp_count != (STRLEN) -1);
13651 if (cp_count == 0) {
13653 RExC_parse++; /* Position after the "}" */
13654 vFAIL("Zero length \\N{}");
13657 ckWARNreg(RExC_parse,
13658 "Ignoring zero length \\N{} in character class");
13661 else { /* cp_count > 1 */
13662 if (! RExC_in_multi_char_class) {
13663 if (invert || range || *RExC_parse == '-') {
13666 vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
13669 ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
13674 = add_multi_match(multi_char_matches,
13678 break; /* <value> contains the first code
13679 point. Drop out of the switch to
13682 } /* End of cp_count != 1 */
13684 /* This element should not be processed further in this
13687 value = save_value;
13688 prevvalue = save_prevvalue;
13689 continue; /* Back to top of loop to get next char */
13691 /* Here, is a single code point, and <value> contains it */
13699 /* We will handle any undefined properties ourselves */
13700 U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF
13701 /* And we actually would prefer to get
13702 * the straight inversion list of the
13703 * swash, since we will be accessing it
13704 * anyway, to save a little time */
13705 |_CORE_SWASH_INIT_ACCEPT_INVLIST;
13707 if (RExC_parse >= RExC_end)
13708 vFAIL2("Empty \\%c{}", (U8)value);
13709 if (*RExC_parse == '{') {
13710 const U8 c = (U8)value;
13711 e = strchr(RExC_parse++, '}');
13713 vFAIL2("Missing right brace on \\%c{}", c);
13714 while (isSPACE(*RExC_parse))
13716 if (e == RExC_parse)
13717 vFAIL2("Empty \\%c{}", c);
13718 n = e - RExC_parse;
13719 while (isSPACE(*(RExC_parse + n - 1)))
13730 if (UCHARAT(RExC_parse) == '^') {
13733 /* toggle. (The rhs xor gets the single bit that
13734 * differs between P and p; the other xor inverts just
13736 value ^= 'P' ^ 'p';
13738 while (isSPACE(*RExC_parse)) {
13743 /* Try to get the definition of the property into
13744 * <invlist>. If /i is in effect, the effective property
13745 * will have its name be <__NAME_i>. The design is
13746 * discussed in commit
13747 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
13748 name = savepv(Perl_form(aTHX_
13750 (FOLD) ? "__" : "",
13756 /* Look up the property name, and get its swash and
13757 * inversion list, if the property is found */
13759 SvREFCNT_dec_NN(swash);
13761 swash = _core_swash_init("utf8", name, &PL_sv_undef,
13764 NULL, /* No inversion list */
13767 if (! swash || ! (invlist = _get_swash_invlist(swash))) {
13768 HV* curpkg = (IN_PERL_COMPILETIME)
13770 : CopSTASH(PL_curcop);
13772 SvREFCNT_dec_NN(swash);
13776 /* Here didn't find it. It could be a user-defined
13777 * property that will be available at run-time. If we
13778 * accept only compile-time properties, is an error;
13779 * otherwise add it to the list for run-time look up */
13781 RExC_parse = e + 1;
13783 "Property '%"UTF8f"' is unknown",
13784 UTF8fARG(UTF, n, name));
13787 /* If the property name doesn't already have a package
13788 * name, add the current one to it so that it can be
13789 * referred to outside it. [perl #121777] */
13790 if (curpkg && ! instr(name, "::")) {
13791 char* pkgname = HvNAME(curpkg);
13792 if (strNE(pkgname, "main")) {
13793 char* full_name = Perl_form(aTHX_
13797 n = strlen(full_name);
13799 name = savepvn(full_name, n);
13802 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
13803 (value == 'p' ? '+' : '!'),
13804 UTF8fARG(UTF, n, name));
13805 has_user_defined_property = TRUE;
13807 /* We don't know yet, so have to assume that the
13808 * property could match something in the Latin1 range,
13809 * hence something that isn't utf8. Note that this
13810 * would cause things in <depends_list> to match
13811 * inappropriately, except that any \p{}, including
13812 * this one forces Unicode semantics, which means there
13813 * is no <depends_list> */
13815 |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
13819 /* Here, did get the swash and its inversion list. If
13820 * the swash is from a user-defined property, then this
13821 * whole character class should be regarded as such */
13822 if (swash_init_flags
13823 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)
13825 has_user_defined_property = TRUE;
13828 /* We warn on matching an above-Unicode code point
13829 * if the match would return true, except don't
13830 * warn for \p{All}, which has exactly one element
13832 (_invlist_contains_cp(invlist, 0x110000)
13833 && (! (_invlist_len(invlist) == 1
13834 && *invlist_array(invlist) == 0)))
13840 /* Invert if asking for the complement */
13841 if (value == 'P') {
13842 _invlist_union_complement_2nd(properties,
13846 /* The swash can't be used as-is, because we've
13847 * inverted things; delay removing it to here after
13848 * have copied its invlist above */
13849 SvREFCNT_dec_NN(swash);
13853 _invlist_union(properties, invlist, &properties);
13858 RExC_parse = e + 1;
13859 namedclass = ANYOF_UNIPROP; /* no official name, but it's
13862 /* \p means they want Unicode semantics */
13863 RExC_uni_semantics = 1;
13866 case 'n': value = '\n'; break;
13867 case 'r': value = '\r'; break;
13868 case 't': value = '\t'; break;
13869 case 'f': value = '\f'; break;
13870 case 'b': value = '\b'; break;
13871 case 'e': value = ESC_NATIVE; break;
13872 case 'a': value = '\a'; break;
13874 RExC_parse--; /* function expects to be pointed at the 'o' */
13876 const char* error_msg;
13877 bool valid = grok_bslash_o(&RExC_parse,
13880 PASS2, /* warnings only in
13883 silence_non_portable,
13889 if (PL_encoding && value < 0x100) {
13890 goto recode_encoding;
13894 RExC_parse--; /* function expects to be pointed at the 'x' */
13896 const char* error_msg;
13897 bool valid = grok_bslash_x(&RExC_parse,
13900 PASS2, /* Output warnings */
13902 silence_non_portable,
13908 if (PL_encoding && value < 0x100)
13909 goto recode_encoding;
13912 value = grok_bslash_c(*RExC_parse++, PASS2);
13914 case '0': case '1': case '2': case '3': case '4':
13915 case '5': case '6': case '7':
13917 /* Take 1-3 octal digits */
13918 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
13919 numlen = (strict) ? 4 : 3;
13920 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
13921 RExC_parse += numlen;
13924 RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
13925 vFAIL("Need exactly 3 octal digits");
13927 else if (! SIZE_ONLY /* like \08, \178 */
13929 && RExC_parse < RExC_end
13930 && isDIGIT(*RExC_parse)
13931 && ckWARN(WARN_REGEXP))
13933 SAVEFREESV(RExC_rx_sv);
13934 reg_warn_non_literal_string(
13936 form_short_octal_warning(RExC_parse, numlen));
13937 (void)ReREFCNT_inc(RExC_rx_sv);
13940 if (PL_encoding && value < 0x100)
13941 goto recode_encoding;
13945 if (! RExC_override_recoding) {
13946 SV* enc = PL_encoding;
13947 value = reg_recode((const char)(U8)value, &enc);
13950 vFAIL("Invalid escape in the specified encoding");
13953 ckWARNreg(RExC_parse,
13954 "Invalid escape in the specified encoding");
13960 /* Allow \_ to not give an error */
13961 if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
13963 vFAIL2("Unrecognized escape \\%c in character class",
13967 SAVEFREESV(RExC_rx_sv);
13968 ckWARN2reg(RExC_parse,
13969 "Unrecognized escape \\%c in character class passed through",
13971 (void)ReREFCNT_inc(RExC_rx_sv);
13975 } /* End of switch on char following backslash */
13976 } /* end of handling backslash escape sequences */
13978 /* Here, we have the current token in 'value' */
13980 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
13983 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
13984 * literal, as is the character that began the false range, i.e.
13985 * the 'a' in the examples */
13988 const int w = (RExC_parse >= rangebegin)
13989 ? RExC_parse - rangebegin
13993 "False [] range \"%"UTF8f"\"",
13994 UTF8fARG(UTF, w, rangebegin));
13997 SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
13998 ckWARN2reg(RExC_parse,
13999 "False [] range \"%"UTF8f"\"",
14000 UTF8fARG(UTF, w, rangebegin));
14001 (void)ReREFCNT_inc(RExC_rx_sv);
14002 cp_list = add_cp_to_invlist(cp_list, '-');
14003 cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
14008 range = 0; /* this was not a true range */
14009 element_count += 2; /* So counts for three values */
14012 classnum = namedclass_to_classnum(namedclass);
14014 if (LOC && namedclass < ANYOF_POSIXL_MAX
14015 #ifndef HAS_ISASCII
14016 && classnum != _CC_ASCII
14019 /* What the Posix classes (like \w, [:space:]) match in locale
14020 * isn't knowable under locale until actual match time. Room
14021 * must be reserved (one time per outer bracketed class) to
14022 * store such classes. The space will contain a bit for each
14023 * named class that is to be matched against. This isn't
14024 * needed for \p{} and pseudo-classes, as they are not affected
14025 * by locale, and hence are dealt with separately */
14026 if (! need_class) {
14029 RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14032 RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
14034 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
14035 ANYOF_POSIXL_ZERO(ret);
14038 /* Coverity thinks it is possible for this to be negative; both
14039 * jhi and khw think it's not, but be safer */
14040 assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14041 || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
14043 /* See if it already matches the complement of this POSIX
14045 if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL)
14046 && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2)
14050 posixl_matches_all = TRUE;
14051 break; /* No need to continue. Since it matches both
14052 e.g., \w and \W, it matches everything, and the
14053 bracketed class can be optimized into qr/./s */
14056 /* Add this class to those that should be checked at runtime */
14057 ANYOF_POSIXL_SET(ret, namedclass);
14059 /* The above-Latin1 characters are not subject to locale rules.
14060 * Just add them, in the second pass, to the
14061 * unconditionally-matched list */
14063 SV* scratch_list = NULL;
14065 /* Get the list of the above-Latin1 code points this
14067 _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
14068 PL_XPosix_ptrs[classnum],
14070 /* Odd numbers are complements, like
14071 * NDIGIT, NASCII, ... */
14072 namedclass % 2 != 0,
14074 /* Checking if 'cp_list' is NULL first saves an extra
14075 * clone. Its reference count will be decremented at the
14076 * next union, etc, or if this is the only instance, at the
14077 * end of the routine */
14079 cp_list = scratch_list;
14082 _invlist_union(cp_list, scratch_list, &cp_list);
14083 SvREFCNT_dec_NN(scratch_list);
14085 continue; /* Go get next character */
14088 else if (! SIZE_ONLY) {
14090 /* Here, not in pass1 (in that pass we skip calculating the
14091 * contents of this class), and is /l, or is a POSIX class for
14092 * which /l doesn't matter (or is a Unicode property, which is
14093 * skipped here). */
14094 if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
14095 if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
14097 /* Here, should be \h, \H, \v, or \V. None of /d, /i
14098 * nor /l make a difference in what these match,
14099 * therefore we just add what they match to cp_list. */
14100 if (classnum != _CC_VERTSPACE) {
14101 assert( namedclass == ANYOF_HORIZWS
14102 || namedclass == ANYOF_NHORIZWS);
14104 /* It turns out that \h is just a synonym for
14106 classnum = _CC_BLANK;
14109 _invlist_union_maybe_complement_2nd(
14111 PL_XPosix_ptrs[classnum],
14112 namedclass % 2 != 0, /* Complement if odd
14113 (NHORIZWS, NVERTWS)
14118 else { /* Garden variety class. If is NASCII, NDIGIT, ...
14119 complement and use nposixes */
14120 SV** posixes_ptr = namedclass % 2 == 0
14123 SV** source_ptr = &PL_XPosix_ptrs[classnum];
14124 _invlist_union_maybe_complement_2nd(
14127 namedclass % 2 != 0,
14131 } /* end of namedclass \blah */
14134 RExC_parse = regpatws(pRExC_state, RExC_parse,
14135 FALSE /* means don't recognize comments */ );
14138 /* If 'range' is set, 'value' is the ending of a range--check its
14139 * validity. (If value isn't a single code point in the case of a
14140 * range, we should have figured that out above in the code that
14141 * catches false ranges). Later, we will handle each individual code
14142 * point in the range. If 'range' isn't set, this could be the
14143 * beginning of a range, so check for that by looking ahead to see if
14144 * the next real character to be processed is the range indicator--the
14148 if (prevvalue > value) /* b-a */ {
14149 const int w = RExC_parse - rangebegin;
14151 "Invalid [] range \"%"UTF8f"\"",
14152 UTF8fARG(UTF, w, rangebegin));
14153 range = 0; /* not a valid range */
14157 prevvalue = value; /* save the beginning of the potential range */
14158 if (! stop_at_1 /* Can't be a range if parsing just one thing */
14159 && *RExC_parse == '-')
14161 char* next_char_ptr = RExC_parse + 1;
14162 if (skip_white) { /* Get the next real char after the '-' */
14163 next_char_ptr = regpatws(pRExC_state,
14165 FALSE); /* means don't recognize
14169 /* If the '-' is at the end of the class (just before the ']',
14170 * it is a literal minus; otherwise it is a range */
14171 if (next_char_ptr < RExC_end && *next_char_ptr != ']') {
14172 RExC_parse = next_char_ptr;
14174 /* a bad range like \w-, [:word:]- ? */
14175 if (namedclass > OOB_NAMEDCLASS) {
14176 if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
14177 const int w = RExC_parse >= rangebegin
14178 ? RExC_parse - rangebegin
14181 vFAIL4("False [] range \"%*.*s\"",
14186 "False [] range \"%*.*s\"",
14191 cp_list = add_cp_to_invlist(cp_list, '-');
14195 range = 1; /* yeah, it's a range! */
14196 continue; /* but do it the next time */
14201 if (namedclass > OOB_NAMEDCLASS) {
14205 /* Here, we have a single value, and <prevvalue> is the beginning of
14206 * the range, if any; or <value> if not */
14208 /* non-Latin1 code point implies unicode semantics. Must be set in
14209 * pass1 so is there for the whole of pass 2 */
14211 RExC_uni_semantics = 1;
14214 /* Ready to process either the single value, or the completed range.
14215 * For single-valued non-inverted ranges, we consider the possibility
14216 * of multi-char folds. (We made a conscious decision to not do this
14217 * for the other cases because it can often lead to non-intuitive
14218 * results. For example, you have the peculiar case that:
14219 * "s s" =~ /^[^\xDF]+$/i => Y
14220 * "ss" =~ /^[^\xDF]+$/i => N
14222 * See [perl #89750] */
14223 if (FOLD && allow_multi_folds && value == prevvalue) {
14224 if (value == LATIN_SMALL_LETTER_SHARP_S
14225 || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
14228 /* Here <value> is indeed a multi-char fold. Get what it is */
14230 U8 foldbuf[UTF8_MAXBYTES_CASE];
14233 UV folded = _to_uni_fold_flags(
14237 FOLD_FLAGS_FULL | (ASCII_FOLD_RESTRICTED
14238 ? FOLD_FLAGS_NOMIX_ASCII
14242 /* Here, <folded> should be the first character of the
14243 * multi-char fold of <value>, with <foldbuf> containing the
14244 * whole thing. But, if this fold is not allowed (because of
14245 * the flags), <fold> will be the same as <value>, and should
14246 * be processed like any other character, so skip the special
14248 if (folded != value) {
14250 /* Skip if we are recursed, currently parsing the class
14251 * again. Otherwise add this character to the list of
14252 * multi-char folds. */
14253 if (! RExC_in_multi_char_class) {
14254 STRLEN cp_count = utf8_length(foldbuf,
14255 foldbuf + foldlen);
14256 SV* multi_fold = sv_2mortal(newSVpvs(""));
14258 Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
14261 = add_multi_match(multi_char_matches,
14267 /* This element should not be processed further in this
14270 value = save_value;
14271 prevvalue = save_prevvalue;
14277 /* Deal with this element of the class */
14280 cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
14283 SV* this_range = _new_invlist(1);
14284 _append_range_to_invlist(this_range, prevvalue, value);
14286 /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
14287 * If this range was specified using something like 'i-j', we want
14288 * to include only the 'i' and the 'j', and not anything in
14289 * between, so exclude non-ASCII, non-alphabetics from it.
14290 * However, if the range was specified with something like
14291 * [\x89-\x91] or [\x89-j], all code points within it should be
14292 * included. literal_endpoint==2 means both ends of the range used
14293 * a literal character, not \x{foo} */
14294 if (literal_endpoint == 2
14295 && ((prevvalue >= 'a' && value <= 'z')
14296 || (prevvalue >= 'A' && value <= 'Z')))
14298 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
14301 /* Since this above only contains ascii, the intersection of it
14302 * with anything will still yield only ascii */
14303 _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
14306 _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
14307 literal_endpoint = 0;
14311 range = 0; /* this range (if it was one) is done now */
14312 } /* End of loop through all the text within the brackets */
14314 /* If anything in the class expands to more than one character, we have to
14315 * deal with them by building up a substitute parse string, and recursively
14316 * calling reg() on it, instead of proceeding */
14317 if (multi_char_matches) {
14318 SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
14321 char *save_end = RExC_end;
14322 char *save_parse = RExC_parse;
14323 bool first_time = TRUE; /* First multi-char occurrence doesn't get
14328 #if 0 /* Have decided not to deal with multi-char folds in inverted classes,
14329 because too confusing */
14331 sv_catpv(substitute_parse, "(?:");
14335 /* Look at the longest folds first */
14336 for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
14338 if (av_exists(multi_char_matches, cp_count)) {
14339 AV** this_array_ptr;
14342 this_array_ptr = (AV**) av_fetch(multi_char_matches,
14344 while ((this_sequence = av_pop(*this_array_ptr)) !=
14347 if (! first_time) {
14348 sv_catpv(substitute_parse, "|");
14350 first_time = FALSE;
14352 sv_catpv(substitute_parse, SvPVX(this_sequence));
14357 /* If the character class contains anything else besides these
14358 * multi-character folds, have to include it in recursive parsing */
14359 if (element_count) {
14360 sv_catpv(substitute_parse, "|[");
14361 sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
14362 sv_catpv(substitute_parse, "]");
14365 sv_catpv(substitute_parse, ")");
14368 /* This is a way to get the parse to skip forward a whole named
14369 * sequence instead of matching the 2nd character when it fails the
14371 sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
14375 RExC_parse = SvPV(substitute_parse, len);
14376 RExC_end = RExC_parse + len;
14377 RExC_in_multi_char_class = 1;
14378 RExC_override_recoding = 1;
14379 RExC_emit = (regnode *)orig_emit;
14381 ret = reg(pRExC_state, 1, ®_flags, depth+1);
14383 *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
14385 RExC_parse = save_parse;
14386 RExC_end = save_end;
14387 RExC_in_multi_char_class = 0;
14388 RExC_override_recoding = 0;
14389 SvREFCNT_dec_NN(multi_char_matches);
14393 /* Here, we've gone through the entire class and dealt with multi-char
14394 * folds. We are now in a position that we can do some checks to see if we
14395 * can optimize this ANYOF node into a simpler one, even in Pass 1.
14396 * Currently we only do two checks:
14397 * 1) is in the unlikely event that the user has specified both, eg. \w and
14398 * \W under /l, then the class matches everything. (This optimization
14399 * is done only to make the optimizer code run later work.)
14400 * 2) if the character class contains only a single element (including a
14401 * single range), we see if there is an equivalent node for it.
14402 * Other checks are possible */
14403 if (! ret_invlist /* Can't optimize if returning the constructed
14405 && (UNLIKELY(posixl_matches_all) || element_count == 1))
14410 if (UNLIKELY(posixl_matches_all)) {
14413 else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like
14414 \w or [:digit:] or \p{foo}
14417 /* All named classes are mapped into POSIXish nodes, with its FLAG
14418 * argument giving which class it is */
14419 switch ((I32)namedclass) {
14420 case ANYOF_UNIPROP:
14423 /* These don't depend on the charset modifiers. They always
14424 * match under /u rules */
14425 case ANYOF_NHORIZWS:
14426 case ANYOF_HORIZWS:
14427 namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
14430 case ANYOF_NVERTWS:
14435 /* The actual POSIXish node for all the rest depends on the
14436 * charset modifier. The ones in the first set depend only on
14437 * ASCII or, if available on this platform, locale */
14441 op = (LOC) ? POSIXL : POSIXA;
14452 /* under /a could be alpha */
14454 if (ASCII_RESTRICTED) {
14455 namedclass = ANYOF_ALPHA + (namedclass % 2);
14463 /* The rest have more possibilities depending on the charset.
14464 * We take advantage of the enum ordering of the charset
14465 * modifiers to get the exact node type, */
14467 op = POSIXD + get_regex_charset(RExC_flags);
14468 if (op > POSIXA) { /* /aa is same as /a */
14473 /* The odd numbered ones are the complements of the
14474 * next-lower even number one */
14475 if (namedclass % 2 == 1) {
14479 arg = namedclass_to_classnum(namedclass);
14483 else if (value == prevvalue) {
14485 /* Here, the class consists of just a single code point */
14488 if (! LOC && value == '\n') {
14489 op = REG_ANY; /* Optimize [^\n] */
14490 *flagp |= HASWIDTH|SIMPLE;
14494 else if (value < 256 || UTF) {
14496 /* Optimize a single value into an EXACTish node, but not if it
14497 * would require converting the pattern to UTF-8. */
14498 op = compute_EXACTish(pRExC_state);
14500 } /* Otherwise is a range */
14501 else if (! LOC) { /* locale could vary these */
14502 if (prevvalue == '0') {
14503 if (value == '9') {
14508 else if (prevvalue == 'A') {
14511 && literal_endpoint == 2
14514 arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
14518 else if (prevvalue == 'a') {
14521 && literal_endpoint == 2
14524 arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
14530 /* Here, we have changed <op> away from its initial value iff we found
14531 * an optimization */
14534 /* Throw away this ANYOF regnode, and emit the calculated one,
14535 * which should correspond to the beginning, not current, state of
14537 const char * cur_parse = RExC_parse;
14538 RExC_parse = (char *)orig_parse;
14542 /* To get locale nodes to not use the full ANYOF size would
14543 * require moving the code above that writes the portions
14544 * of it that aren't in other nodes to after this point.
14545 * e.g. ANYOF_POSIXL_SET */
14546 RExC_size = orig_size;
14550 RExC_emit = (regnode *)orig_emit;
14551 if (PL_regkind[op] == POSIXD) {
14552 if (op == POSIXL) {
14553 RExC_contains_locale = 1;
14556 op += NPOSIXD - POSIXD;
14561 ret = reg_node(pRExC_state, op);
14563 if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
14567 *flagp |= HASWIDTH|SIMPLE;
14569 else if (PL_regkind[op] == EXACT) {
14570 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
14571 TRUE /* downgradable to EXACT */
14575 RExC_parse = (char *) cur_parse;
14577 SvREFCNT_dec(posixes);
14578 SvREFCNT_dec(nposixes);
14579 SvREFCNT_dec(cp_list);
14580 SvREFCNT_dec(cp_foldable_list);
14587 /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
14589 /* If folding, we calculate all characters that could fold to or from the
14590 * ones already on the list */
14591 if (cp_foldable_list) {
14593 UV start, end; /* End points of code point ranges */
14595 SV* fold_intersection = NULL;
14598 /* Our calculated list will be for Unicode rules. For locale
14599 * matching, we have to keep a separate list that is consulted at
14600 * runtime only when the locale indicates Unicode rules. For
14601 * non-locale, we just use to the general list */
14603 use_list = &only_utf8_locale_list;
14606 use_list = &cp_list;
14609 /* Only the characters in this class that participate in folds need
14610 * be checked. Get the intersection of this class and all the
14611 * possible characters that are foldable. This can quickly narrow
14612 * down a large class */
14613 _invlist_intersection(PL_utf8_foldable, cp_foldable_list,
14614 &fold_intersection);
14616 /* The folds for all the Latin1 characters are hard-coded into this
14617 * program, but we have to go out to disk to get the others. */
14618 if (invlist_highest(cp_foldable_list) >= 256) {
14620 /* This is a hash that for a particular fold gives all
14621 * characters that are involved in it */
14622 if (! PL_utf8_foldclosures) {
14623 _load_PL_utf8_foldclosures();
14627 /* Now look at the foldable characters in this class individually */
14628 invlist_iterinit(fold_intersection);
14629 while (invlist_iternext(fold_intersection, &start, &end)) {
14632 /* Look at every character in the range */
14633 for (j = start; j <= end; j++) {
14634 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
14640 if (IS_IN_SOME_FOLD_L1(j)) {
14642 /* ASCII is always matched; non-ASCII is matched
14643 * only under Unicode rules (which could happen
14644 * under /l if the locale is a UTF-8 one */
14645 if (isASCII(j) || ! DEPENDS_SEMANTICS) {
14646 *use_list = add_cp_to_invlist(*use_list,
14647 PL_fold_latin1[j]);
14651 add_cp_to_invlist(depends_list,
14652 PL_fold_latin1[j]);
14656 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
14657 && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
14659 add_above_Latin1_folds(pRExC_state,
14666 /* Here is an above Latin1 character. We don't have the
14667 * rules hard-coded for it. First, get its fold. This is
14668 * the simple fold, as the multi-character folds have been
14669 * handled earlier and separated out */
14670 _to_uni_fold_flags(j, foldbuf, &foldlen,
14671 (ASCII_FOLD_RESTRICTED)
14672 ? FOLD_FLAGS_NOMIX_ASCII
14675 /* Single character fold of above Latin1. Add everything in
14676 * its fold closure to the list that this node should match.
14677 * The fold closures data structure is a hash with the keys
14678 * being the UTF-8 of every character that is folded to, like
14679 * 'k', and the values each an array of all code points that
14680 * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
14681 * Multi-character folds are not included */
14682 if ((listp = hv_fetch(PL_utf8_foldclosures,
14683 (char *) foldbuf, foldlen, FALSE)))
14685 AV* list = (AV*) *listp;
14687 for (k = 0; k <= av_tindex(list); k++) {
14688 SV** c_p = av_fetch(list, k, FALSE);
14694 /* /aa doesn't allow folds between ASCII and non- */
14695 if ((ASCII_FOLD_RESTRICTED
14696 && (isASCII(c) != isASCII(j))))
14701 /* Folds under /l which cross the 255/256 boundary
14702 * are added to a separate list. (These are valid
14703 * only when the locale is UTF-8.) */
14704 if (c < 256 && LOC) {
14705 *use_list = add_cp_to_invlist(*use_list, c);
14709 if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
14711 cp_list = add_cp_to_invlist(cp_list, c);
14714 /* Similarly folds involving non-ascii Latin1
14715 * characters under /d are added to their list */
14716 depends_list = add_cp_to_invlist(depends_list,
14723 SvREFCNT_dec_NN(fold_intersection);
14726 /* Now that we have finished adding all the folds, there is no reason
14727 * to keep the foldable list separate */
14728 _invlist_union(cp_list, cp_foldable_list, &cp_list);
14729 SvREFCNT_dec_NN(cp_foldable_list);
14732 /* And combine the result (if any) with any inversion list from posix
14733 * classes. The lists are kept separate up to now because we don't want to
14734 * fold the classes (folding of those is automatically handled by the swash
14735 * fetching code) */
14736 if (posixes || nposixes) {
14737 if (posixes && AT_LEAST_ASCII_RESTRICTED) {
14738 /* Under /a and /aa, nothing above ASCII matches these */
14739 _invlist_intersection(posixes,
14740 PL_XPosix_ptrs[_CC_ASCII],
14744 if (DEPENDS_SEMANTICS) {
14745 /* Under /d, everything in the upper half of the Latin1 range
14746 * matches these complements */
14747 ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
14749 else if (AT_LEAST_ASCII_RESTRICTED) {
14750 /* Under /a and /aa, everything above ASCII matches these
14752 _invlist_union_complement_2nd(nposixes,
14753 PL_XPosix_ptrs[_CC_ASCII],
14757 _invlist_union(posixes, nposixes, &posixes);
14758 SvREFCNT_dec_NN(nposixes);
14761 posixes = nposixes;
14764 if (! DEPENDS_SEMANTICS) {
14766 _invlist_union(cp_list, posixes, &cp_list);
14767 SvREFCNT_dec_NN(posixes);
14774 /* Under /d, we put into a separate list the Latin1 things that
14775 * match only when the target string is utf8 */
14776 SV* nonascii_but_latin1_properties = NULL;
14777 _invlist_intersection(posixes, PL_UpperLatin1,
14778 &nonascii_but_latin1_properties);
14779 _invlist_subtract(posixes, nonascii_but_latin1_properties,
14782 _invlist_union(cp_list, posixes, &cp_list);
14783 SvREFCNT_dec_NN(posixes);
14789 if (depends_list) {
14790 _invlist_union(depends_list, nonascii_but_latin1_properties,
14792 SvREFCNT_dec_NN(nonascii_but_latin1_properties);
14795 depends_list = nonascii_but_latin1_properties;
14800 /* And combine the result (if any) with any inversion list from properties.
14801 * The lists are kept separate up to now so that we can distinguish the two
14802 * in regards to matching above-Unicode. A run-time warning is generated
14803 * if a Unicode property is matched against a non-Unicode code point. But,
14804 * we allow user-defined properties to match anything, without any warning,
14805 * and we also suppress the warning if there is a portion of the character
14806 * class that isn't a Unicode property, and which matches above Unicode, \W
14807 * or [\x{110000}] for example.
14808 * (Note that in this case, unlike the Posix one above, there is no
14809 * <depends_list>, because having a Unicode property forces Unicode
14814 /* If it matters to the final outcome, see if a non-property
14815 * component of the class matches above Unicode. If so, the
14816 * warning gets suppressed. This is true even if just a single
14817 * such code point is specified, as though not strictly correct if
14818 * another such code point is matched against, the fact that they
14819 * are using above-Unicode code points indicates they should know
14820 * the issues involved */
14822 warn_super = ! (invert
14823 ^ (invlist_highest(cp_list) > PERL_UNICODE_MAX));
14826 _invlist_union(properties, cp_list, &cp_list);
14827 SvREFCNT_dec_NN(properties);
14830 cp_list = properties;
14834 ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
14838 /* Here, we have calculated what code points should be in the character
14841 * Now we can see about various optimizations. Fold calculation (which we
14842 * did above) needs to take place before inversion. Otherwise /[^k]/i
14843 * would invert to include K, which under /i would match k, which it
14844 * shouldn't. Therefore we can't invert folded locale now, as it won't be
14845 * folded until runtime */
14847 /* If we didn't do folding, it's because some information isn't available
14848 * until runtime; set the run-time fold flag for these. (We don't have to
14849 * worry about properties folding, as that is taken care of by the swash
14850 * fetching). We know to set the flag if we have a non-NULL list for UTF-8
14851 * locales, or the class matches at least one 0-255 range code point */
14853 if (only_utf8_locale_list) {
14854 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14856 else if (cp_list) { /* Look to see if there a 0-255 code point is in
14859 invlist_iterinit(cp_list);
14860 if (invlist_iternext(cp_list, &start, &end) && start < 256) {
14861 ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
14863 invlist_iterfinish(cp_list);
14867 /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
14868 * at compile time. Besides not inverting folded locale now, we can't
14869 * invert if there are things such as \w, which aren't known until runtime
14873 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14875 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
14877 _invlist_invert(cp_list);
14879 /* Any swash can't be used as-is, because we've inverted things */
14881 SvREFCNT_dec_NN(swash);
14885 /* Clear the invert flag since have just done it here */
14890 *ret_invlist = cp_list;
14891 SvREFCNT_dec(swash);
14893 /* Discard the generated node */
14895 RExC_size = orig_size;
14898 RExC_emit = orig_emit;
14903 /* Some character classes are equivalent to other nodes. Such nodes take
14904 * up less room and generally fewer operations to execute than ANYOF nodes.
14905 * Above, we checked for and optimized into some such equivalents for
14906 * certain common classes that are easy to test. Getting to this point in
14907 * the code means that the class didn't get optimized there. Since this
14908 * code is only executed in Pass 2, it is too late to save space--it has
14909 * been allocated in Pass 1, and currently isn't given back. But turning
14910 * things into an EXACTish node can allow the optimizer to join it to any
14911 * adjacent such nodes. And if the class is equivalent to things like /./,
14912 * expensive run-time swashes can be avoided. Now that we have more
14913 * complete information, we can find things necessarily missed by the
14914 * earlier code. I (khw) am not sure how much to look for here. It would
14915 * be easy, but perhaps too slow, to check any candidates against all the
14916 * node types they could possibly match using _invlistEQ(). */
14921 && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
14922 && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
14924 /* We don't optimize if we are supposed to make sure all non-Unicode
14925 * code points raise a warning, as only ANYOF nodes have this check.
14927 && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
14930 U8 op = END; /* The optimzation node-type */
14931 const char * cur_parse= RExC_parse;
14933 invlist_iterinit(cp_list);
14934 if (! invlist_iternext(cp_list, &start, &end)) {
14936 /* Here, the list is empty. This happens, for example, when a
14937 * Unicode property is the only thing in the character class, and
14938 * it doesn't match anything. (perluniprops.pod notes such
14941 *flagp |= HASWIDTH|SIMPLE;
14943 else if (start == end) { /* The range is a single code point */
14944 if (! invlist_iternext(cp_list, &start, &end)
14946 /* Don't do this optimization if it would require changing
14947 * the pattern to UTF-8 */
14948 && (start < 256 || UTF))
14950 /* Here, the list contains a single code point. Can optimize
14951 * into an EXACTish node */
14960 /* A locale node under folding with one code point can be
14961 * an EXACTFL, as its fold won't be calculated until
14967 /* Here, we are generally folding, but there is only one
14968 * code point to match. If we have to, we use an EXACT
14969 * node, but it would be better for joining with adjacent
14970 * nodes in the optimization pass if we used the same
14971 * EXACTFish node that any such are likely to be. We can
14972 * do this iff the code point doesn't participate in any
14973 * folds. For example, an EXACTF of a colon is the same as
14974 * an EXACT one, since nothing folds to or from a colon. */
14976 if (IS_IN_SOME_FOLD_L1(value)) {
14981 if (_invlist_contains_cp(PL_utf8_foldable, value)) {
14986 /* If we haven't found the node type, above, it means we
14987 * can use the prevailing one */
14989 op = compute_EXACTish(pRExC_state);
14994 else if (start == 0) {
14995 if (end == UV_MAX) {
14997 *flagp |= HASWIDTH|SIMPLE;
15000 else if (end == '\n' - 1
15001 && invlist_iternext(cp_list, &start, &end)
15002 && start == '\n' + 1 && end == UV_MAX)
15005 *flagp |= HASWIDTH|SIMPLE;
15009 invlist_iterfinish(cp_list);
15012 RExC_parse = (char *)orig_parse;
15013 RExC_emit = (regnode *)orig_emit;
15015 ret = reg_node(pRExC_state, op);
15017 RExC_parse = (char *)cur_parse;
15019 if (PL_regkind[op] == EXACT) {
15020 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
15021 TRUE /* downgradable to EXACT */
15025 SvREFCNT_dec_NN(cp_list);
15030 /* Here, <cp_list> contains all the code points we can determine at
15031 * compile time that match under all conditions. Go through it, and
15032 * for things that belong in the bitmap, put them there, and delete from
15033 * <cp_list>. While we are at it, see if everything above 255 is in the
15034 * list, and if so, set a flag to speed up execution */
15036 populate_ANYOF_from_invlist(ret, &cp_list);
15039 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
15042 /* Here, the bitmap has been populated with all the Latin1 code points that
15043 * always match. Can now add to the overall list those that match only
15044 * when the target string is UTF-8 (<depends_list>). */
15045 if (depends_list) {
15047 _invlist_union(cp_list, depends_list, &cp_list);
15048 SvREFCNT_dec_NN(depends_list);
15051 cp_list = depends_list;
15053 ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
15056 /* If there is a swash and more than one element, we can't use the swash in
15057 * the optimization below. */
15058 if (swash && element_count > 1) {
15059 SvREFCNT_dec_NN(swash);
15063 /* Note that the optimization of using 'swash' if it is the only thing in
15064 * the class doesn't have us change swash at all, so it can include things
15065 * that are also in the bitmap; otherwise we have purposely deleted that
15066 * duplicate information */
15067 set_ANYOF_arg(pRExC_state, ret, cp_list,
15068 (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
15070 only_utf8_locale_list,
15071 swash, has_user_defined_property);
15073 *flagp |= HASWIDTH|SIMPLE;
15075 if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
15076 RExC_contains_locale = 1;
15082 #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
15085 S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state,
15086 regnode* const node,
15088 SV* const runtime_defns,
15089 SV* const only_utf8_locale_list,
15091 const bool has_user_defined_property)
15093 /* Sets the arg field of an ANYOF-type node 'node', using information about
15094 * the node passed-in. If there is nothing outside the node's bitmap, the
15095 * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to
15096 * the count returned by add_data(), having allocated and stored an array,
15097 * av, that that count references, as follows:
15098 * av[0] stores the character class description in its textual form.
15099 * This is used later (regexec.c:Perl_regclass_swash()) to
15100 * initialize the appropriate swash, and is also useful for dumping
15101 * the regnode. This is set to &PL_sv_undef if the textual
15102 * description is not needed at run-time (as happens if the other
15103 * elements completely define the class)
15104 * av[1] if &PL_sv_undef, is a placeholder to later contain the swash
15105 * computed from av[0]. But if no further computation need be done,
15106 * the swash is stored here now (and av[0] is &PL_sv_undef).
15107 * av[2] stores the inversion list of code points that match only if the
15108 * current locale is UTF-8
15109 * av[3] stores the cp_list inversion list for use in addition or instead
15110 * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
15111 * (Otherwise everything needed is already in av[0] and av[1])
15112 * av[4] is set if any component of the class is from a user-defined
15113 * property; used only if av[3] exists */
15117 PERL_ARGS_ASSERT_SET_ANYOF_ARG;
15119 if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
15120 assert(! (ANYOF_FLAGS(node)
15121 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15122 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
15123 ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
15126 AV * const av = newAV();
15129 assert(ANYOF_FLAGS(node)
15130 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15131 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15133 av_store(av, 0, (runtime_defns)
15134 ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
15137 av_store(av, 1, swash);
15138 SvREFCNT_dec_NN(cp_list);
15141 av_store(av, 1, &PL_sv_undef);
15143 av_store(av, 3, cp_list);
15144 av_store(av, 4, newSVuv(has_user_defined_property));
15148 if (only_utf8_locale_list) {
15149 av_store(av, 2, only_utf8_locale_list);
15152 av_store(av, 2, &PL_sv_undef);
15155 rv = newRV_noinc(MUTABLE_SV(av));
15156 n = add_data(pRExC_state, STR_WITH_LEN("s"));
15157 RExC_rxi->data->data[n] = (void*)rv;
15162 #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
15164 Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
15165 const regnode* node,
15168 SV** only_utf8_locale_ptr,
15172 /* For internal core use only.
15173 * Returns the swash for the input 'node' in the regex 'prog'.
15174 * If <doinit> is 'true', will attempt to create the swash if not already
15176 * If <listsvp> is non-null, will return the printable contents of the
15177 * swash. This can be used to get debugging information even before the
15178 * swash exists, by calling this function with 'doinit' set to false, in
15179 * which case the components that will be used to eventually create the
15180 * swash are returned (in a printable form).
15181 * If <exclude_list> is not NULL, it is an inversion list of things to
15182 * exclude from what's returned in <listsvp>.
15183 * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
15184 * that, in spite of this function's name, the swash it returns may include
15185 * the bitmap data as well */
15188 SV *si = NULL; /* Input swash initialization string */
15189 SV* invlist = NULL;
15191 RXi_GET_DECL(prog,progi);
15192 const struct reg_data * const data = prog ? progi->data : NULL;
15194 PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
15196 assert(ANYOF_FLAGS(node)
15197 & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
15198 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
15200 if (data && data->count) {
15201 const U32 n = ARG(node);
15203 if (data->what[n] == 's') {
15204 SV * const rv = MUTABLE_SV(data->data[n]);
15205 AV * const av = MUTABLE_AV(SvRV(rv));
15206 SV **const ary = AvARRAY(av);
15207 U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
15209 si = *ary; /* ary[0] = the string to initialize the swash with */
15211 /* Elements 3 and 4 are either both present or both absent. [3] is
15212 * any inversion list generated at compile time; [4] indicates if
15213 * that inversion list has any user-defined properties in it. */
15214 if (av_tindex(av) >= 2) {
15215 if (only_utf8_locale_ptr
15217 && ary[2] != &PL_sv_undef)
15219 *only_utf8_locale_ptr = ary[2];
15222 assert(only_utf8_locale_ptr);
15223 *only_utf8_locale_ptr = NULL;
15226 if (av_tindex(av) >= 3) {
15228 if (SvUV(ary[4])) {
15229 swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
15237 /* Element [1] is reserved for the set-up swash. If already there,
15238 * return it; if not, create it and store it there */
15239 if (ary[1] && SvROK(ary[1])) {
15242 else if (doinit && ((si && si != &PL_sv_undef)
15243 || (invlist && invlist != &PL_sv_undef))) {
15245 sw = _core_swash_init("utf8", /* the utf8 package */
15249 0, /* not from tr/// */
15251 &swash_init_flags);
15252 (void)av_store(av, 1, sw);
15257 /* If requested, return a printable version of what this swash matches */
15259 SV* matches_string = newSVpvs("");
15261 /* The swash should be used, if possible, to get the data, as it
15262 * contains the resolved data. But this function can be called at
15263 * compile-time, before everything gets resolved, in which case we
15264 * return the currently best available information, which is the string
15265 * that will eventually be used to do that resolving, 'si' */
15266 if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
15267 && (si && si != &PL_sv_undef))
15269 sv_catsv(matches_string, si);
15272 /* Add the inversion list to whatever we have. This may have come from
15273 * the swash, or from an input parameter */
15275 if (exclude_list) {
15276 SV* clone = invlist_clone(invlist);
15277 _invlist_subtract(clone, exclude_list, &clone);
15278 sv_catsv(matches_string, _invlist_contents(clone));
15279 SvREFCNT_dec_NN(clone);
15282 sv_catsv(matches_string, _invlist_contents(invlist));
15285 *listsvp = matches_string;
15290 #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
15292 /* reg_skipcomment()
15294 Absorbs an /x style # comment from the input stream,
15295 returning a pointer to the first character beyond the comment, or if the
15296 comment terminates the pattern without anything following it, this returns
15297 one past the final character of the pattern (in other words, RExC_end) and
15298 sets the REG_RUN_ON_COMMENT_SEEN flag.
15300 Note it's the callers responsibility to ensure that we are
15301 actually in /x mode
15305 PERL_STATIC_INLINE char*
15306 S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
15308 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
15312 while (p < RExC_end) {
15313 if (*(++p) == '\n') {
15318 /* we ran off the end of the pattern without ending the comment, so we have
15319 * to add an \n when wrapping */
15320 RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
15326 Advances the parse position, and optionally absorbs
15327 "whitespace" from the inputstream.
15329 Without /x "whitespace" means (?#...) style comments only,
15330 with /x this means (?#...) and # comments and whitespace proper.
15332 Returns the RExC_parse point from BEFORE the scan occurs.
15334 This is the /x friendly way of saying RExC_parse++.
15338 S_nextchar(pTHX_ RExC_state_t *pRExC_state)
15340 char* const retval = RExC_parse++;
15342 PERL_ARGS_ASSERT_NEXTCHAR;
15345 if (RExC_end - RExC_parse >= 3
15346 && *RExC_parse == '('
15347 && RExC_parse[1] == '?'
15348 && RExC_parse[2] == '#')
15350 while (*RExC_parse != ')') {
15351 if (RExC_parse == RExC_end)
15352 FAIL("Sequence (?#... not terminated");
15358 if (RExC_flags & RXf_PMf_EXTENDED) {
15359 char * p = regpatws(pRExC_state, RExC_parse,
15360 TRUE); /* means recognize comments */
15361 if (p != RExC_parse) {
15371 - reg_node - emit a node
15373 STATIC regnode * /* Location. */
15374 S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
15377 regnode * const ret = RExC_emit;
15378 GET_RE_DEBUG_FLAGS_DECL;
15380 PERL_ARGS_ASSERT_REG_NODE;
15383 SIZE_ALIGN(RExC_size);
15387 if (RExC_emit >= RExC_emit_bound)
15388 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15389 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15391 NODE_ALIGN_FILL(ret);
15393 FILL_ADVANCE_NODE(ptr, op);
15394 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 1);
15395 #ifdef RE_TRACK_PATTERN_OFFSETS
15396 if (RExC_offsets) { /* MJD */
15398 ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
15399 "reg_node", __LINE__,
15401 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
15402 ? "Overwriting end of array!\n" : "OK",
15403 (UV)(RExC_emit - RExC_emit_start),
15404 (UV)(RExC_parse - RExC_start),
15405 (UV)RExC_offsets[0]));
15406 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
15414 - reganode - emit a node with an argument
15416 STATIC regnode * /* Location. */
15417 S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
15420 regnode * const ret = RExC_emit;
15421 GET_RE_DEBUG_FLAGS_DECL;
15423 PERL_ARGS_ASSERT_REGANODE;
15426 SIZE_ALIGN(RExC_size);
15431 assert(2==regarglen[op]+1);
15433 Anything larger than this has to allocate the extra amount.
15434 If we changed this to be:
15436 RExC_size += (1 + regarglen[op]);
15438 then it wouldn't matter. Its not clear what side effect
15439 might come from that so its not done so far.
15444 if (RExC_emit >= RExC_emit_bound)
15445 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
15446 op, (void*)RExC_emit, (void*)RExC_emit_bound);
15448 NODE_ALIGN_FILL(ret);
15450 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
15451 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (ptr) - 2);
15452 #ifdef RE_TRACK_PATTERN_OFFSETS
15453 if (RExC_offsets) { /* MJD */
15455 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15459 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
15460 "Overwriting end of array!\n" : "OK",
15461 (UV)(RExC_emit - RExC_emit_start),
15462 (UV)(RExC_parse - RExC_start),
15463 (UV)RExC_offsets[0]));
15464 Set_Cur_Node_Offset;
15472 - reguni - emit (if appropriate) a Unicode character
15474 PERL_STATIC_INLINE STRLEN
15475 S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
15477 PERL_ARGS_ASSERT_REGUNI;
15479 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
15483 - reginsert - insert an operator in front of already-emitted operand
15485 * Means relocating the operand.
15488 S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
15493 const int offset = regarglen[(U8)op];
15494 const int size = NODE_STEP_REGNODE + offset;
15495 GET_RE_DEBUG_FLAGS_DECL;
15497 PERL_ARGS_ASSERT_REGINSERT;
15498 PERL_UNUSED_CONTEXT;
15499 PERL_UNUSED_ARG(depth);
15500 /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
15501 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
15510 if (RExC_open_parens) {
15512 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
15513 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
15514 if ( RExC_open_parens[paren] >= opnd ) {
15515 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
15516 RExC_open_parens[paren] += size;
15518 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
15520 if ( RExC_close_parens[paren] >= opnd ) {
15521 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
15522 RExC_close_parens[paren] += size;
15524 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
15529 while (src > opnd) {
15530 StructCopy(--src, --dst, regnode);
15531 #ifdef RE_TRACK_PATTERN_OFFSETS
15532 if (RExC_offsets) { /* MJD 20010112 */
15534 ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
15538 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
15539 ? "Overwriting end of array!\n" : "OK",
15540 (UV)(src - RExC_emit_start),
15541 (UV)(dst - RExC_emit_start),
15542 (UV)RExC_offsets[0]));
15543 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
15544 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
15550 place = opnd; /* Op node, where operand used to be. */
15551 #ifdef RE_TRACK_PATTERN_OFFSETS
15552 if (RExC_offsets) { /* MJD */
15554 ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
15558 (UV)(place - RExC_emit_start) > RExC_offsets[0]
15559 ? "Overwriting end of array!\n" : "OK",
15560 (UV)(place - RExC_emit_start),
15561 (UV)(RExC_parse - RExC_start),
15562 (UV)RExC_offsets[0]));
15563 Set_Node_Offset(place, RExC_parse);
15564 Set_Node_Length(place, 1);
15567 src = NEXTOPER(place);
15568 FILL_ADVANCE_NODE(place, op);
15569 REH_CALL_COMP_NODE_HOOK(pRExC_state->rx, (place) - 1);
15570 Zero(src, offset, regnode);
15574 - regtail - set the next-pointer at the end of a node chain of p to val.
15575 - SEE ALSO: regtail_study
15577 /* TODO: All three parms should be const */
15579 S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15580 const regnode *val,U32 depth)
15583 GET_RE_DEBUG_FLAGS_DECL;
15585 PERL_ARGS_ASSERT_REGTAIL;
15587 PERL_UNUSED_ARG(depth);
15593 /* Find last node. */
15596 regnode * const temp = regnext(scan);
15598 SV * const mysv=sv_newmortal();
15599 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
15600 regprop(RExC_rx, mysv, scan, NULL);
15601 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
15602 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
15603 (temp == NULL ? "->" : ""),
15604 (temp == NULL ? PL_reg_name[OP(val)] : "")
15612 if (reg_off_by_arg[OP(scan)]) {
15613 ARG_SET(scan, val - scan);
15616 NEXT_OFF(scan) = val - scan;
15622 - regtail_study - set the next-pointer at the end of a node chain of p to val.
15623 - Look for optimizable sequences at the same time.
15624 - currently only looks for EXACT chains.
15626 This is experimental code. The idea is to use this routine to perform
15627 in place optimizations on branches and groups as they are constructed,
15628 with the long term intention of removing optimization from study_chunk so
15629 that it is purely analytical.
15631 Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
15632 to control which is which.
15635 /* TODO: All four parms should be const */
15638 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
15639 const regnode *val,U32 depth)
15643 #ifdef EXPERIMENTAL_INPLACESCAN
15646 GET_RE_DEBUG_FLAGS_DECL;
15648 PERL_ARGS_ASSERT_REGTAIL_STUDY;
15654 /* Find last node. */
15658 regnode * const temp = regnext(scan);
15659 #ifdef EXPERIMENTAL_INPLACESCAN
15660 if (PL_regkind[OP(scan)] == EXACT) {
15661 bool unfolded_multi_char; /* Unexamined in this routine */
15662 if (join_exact(pRExC_state, scan, &min,
15663 &unfolded_multi_char, 1, val, depth+1))
15668 switch (OP(scan)) {
15671 case EXACTFA_NO_TRIE:
15676 if( exact == PSEUDO )
15678 else if ( exact != OP(scan) )
15687 SV * const mysv=sv_newmortal();
15688 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
15689 regprop(RExC_rx, mysv, scan, NULL);
15690 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
15691 SvPV_nolen_const(mysv),
15692 REG_NODE_NUM(scan),
15693 PL_reg_name[exact]);
15700 SV * const mysv_val=sv_newmortal();
15701 DEBUG_PARSE_MSG("");
15702 regprop(RExC_rx, mysv_val, val, NULL);
15703 PerlIO_printf(Perl_debug_log,
15704 "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
15705 SvPV_nolen_const(mysv_val),
15706 (IV)REG_NODE_NUM(val),
15710 if (reg_off_by_arg[OP(scan)]) {
15711 ARG_SET(scan, val - scan);
15714 NEXT_OFF(scan) = val - scan;
15722 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
15727 S_regdump_intflags(pTHX_ const char *lead, const U32 flags)
15732 ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15734 for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
15735 if (flags & (1<<bit)) {
15736 if (!set++ && lead)
15737 PerlIO_printf(Perl_debug_log, "%s",lead);
15738 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]);
15743 PerlIO_printf(Perl_debug_log, "\n");
15745 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15750 S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
15756 ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
15758 for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
15759 if (flags & (1<<bit)) {
15760 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
15763 if (!set++ && lead)
15764 PerlIO_printf(Perl_debug_log, "%s",lead);
15765 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
15768 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
15769 if (!set++ && lead) {
15770 PerlIO_printf(Perl_debug_log, "%s",lead);
15773 case REGEX_UNICODE_CHARSET:
15774 PerlIO_printf(Perl_debug_log, "UNICODE");
15776 case REGEX_LOCALE_CHARSET:
15777 PerlIO_printf(Perl_debug_log, "LOCALE");
15779 case REGEX_ASCII_RESTRICTED_CHARSET:
15780 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
15782 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
15783 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
15786 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
15792 PerlIO_printf(Perl_debug_log, "\n");
15794 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
15800 Perl_regdump(pTHX_ const regexp *r)
15803 SV * const sv = sv_newmortal();
15804 SV *dsv= sv_newmortal();
15805 RXi_GET_DECL(r,ri);
15806 GET_RE_DEBUG_FLAGS_DECL;
15808 PERL_ARGS_ASSERT_REGDUMP;
15810 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
15812 /* Header fields of interest. */
15813 if (r->anchored_substr) {
15814 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
15815 RE_SV_DUMPLEN(r->anchored_substr), 30);
15816 PerlIO_printf(Perl_debug_log,
15817 "anchored %s%s at %"IVdf" ",
15818 s, RE_SV_TAIL(r->anchored_substr),
15819 (IV)r->anchored_offset);
15820 } else if (r->anchored_utf8) {
15821 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
15822 RE_SV_DUMPLEN(r->anchored_utf8), 30);
15823 PerlIO_printf(Perl_debug_log,
15824 "anchored utf8 %s%s at %"IVdf" ",
15825 s, RE_SV_TAIL(r->anchored_utf8),
15826 (IV)r->anchored_offset);
15828 if (r->float_substr) {
15829 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
15830 RE_SV_DUMPLEN(r->float_substr), 30);
15831 PerlIO_printf(Perl_debug_log,
15832 "floating %s%s at %"IVdf"..%"UVuf" ",
15833 s, RE_SV_TAIL(r->float_substr),
15834 (IV)r->float_min_offset, (UV)r->float_max_offset);
15835 } else if (r->float_utf8) {
15836 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
15837 RE_SV_DUMPLEN(r->float_utf8), 30);
15838 PerlIO_printf(Perl_debug_log,
15839 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
15840 s, RE_SV_TAIL(r->float_utf8),
15841 (IV)r->float_min_offset, (UV)r->float_max_offset);
15843 if (r->check_substr || r->check_utf8)
15844 PerlIO_printf(Perl_debug_log,
15846 (r->check_substr == r->float_substr
15847 && r->check_utf8 == r->float_utf8
15848 ? "(checking floating" : "(checking anchored"));
15849 if (r->intflags & PREGf_NOSCAN)
15850 PerlIO_printf(Perl_debug_log, " noscan");
15851 if (r->extflags & RXf_CHECK_ALL)
15852 PerlIO_printf(Perl_debug_log, " isall");
15853 if (r->check_substr || r->check_utf8)
15854 PerlIO_printf(Perl_debug_log, ") ");
15856 if (ri->regstclass) {
15857 regprop(r, sv, ri->regstclass, NULL);
15858 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
15860 if (r->intflags & PREGf_ANCH) {
15861 PerlIO_printf(Perl_debug_log, "anchored");
15862 if (r->intflags & PREGf_ANCH_MBOL)
15863 PerlIO_printf(Perl_debug_log, "(MBOL)");
15864 if (r->intflags & PREGf_ANCH_SBOL)
15865 PerlIO_printf(Perl_debug_log, "(SBOL)");
15866 if (r->intflags & PREGf_ANCH_GPOS)
15867 PerlIO_printf(Perl_debug_log, "(GPOS)");
15868 PerlIO_putc(Perl_debug_log, ' ');
15870 if (r->intflags & PREGf_GPOS_SEEN)
15871 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
15872 if (r->intflags & PREGf_SKIP)
15873 PerlIO_printf(Perl_debug_log, "plus ");
15874 if (r->intflags & PREGf_IMPLICIT)
15875 PerlIO_printf(Perl_debug_log, "implicit ");
15876 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
15877 if (r->extflags & RXf_EVAL_SEEN)
15878 PerlIO_printf(Perl_debug_log, "with eval ");
15879 PerlIO_printf(Perl_debug_log, "\n");
15881 regdump_extflags("r->extflags: ",r->extflags);
15882 regdump_intflags("r->intflags: ",r->intflags);
15885 PERL_ARGS_ASSERT_REGDUMP;
15886 PERL_UNUSED_CONTEXT;
15887 PERL_UNUSED_ARG(r);
15888 #endif /* DEBUGGING */
15892 - regprop - printable representation of opcode, with run time support
15896 Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
15901 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
15902 static const char * const anyofs[] = {
15903 #if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \
15904 || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \
15905 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \
15906 || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \
15907 || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \
15908 || _CC_VERTSPACE != 16
15909 #error Need to adjust order of anyofs[]
15946 RXi_GET_DECL(prog,progi);
15947 GET_RE_DEBUG_FLAGS_DECL;
15949 PERL_ARGS_ASSERT_REGPROP;
15953 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
15954 /* It would be nice to FAIL() here, but this may be called from
15955 regexec.c, and it would be hard to supply pRExC_state. */
15956 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
15957 (int)OP(o), (int)REGNODE_MAX);
15958 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
15960 k = PL_regkind[OP(o)];
15963 sv_catpvs(sv, " ");
15964 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
15965 * is a crude hack but it may be the best for now since
15966 * we have no flag "this EXACTish node was UTF-8"
15968 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
15969 PERL_PV_ESCAPE_UNI_DETECT |
15970 PERL_PV_ESCAPE_NONASCII |
15971 PERL_PV_PRETTY_ELLIPSES |
15972 PERL_PV_PRETTY_LTGT |
15973 PERL_PV_PRETTY_NOCLEAR
15975 } else if (k == TRIE) {
15976 /* print the details of the trie in dumpuntil instead, as
15977 * progi->data isn't available here */
15978 const char op = OP(o);
15979 const U32 n = ARG(o);
15980 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
15981 (reg_ac_data *)progi->data->data[n] :
15983 const reg_trie_data * const trie
15984 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
15986 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
15987 DEBUG_TRIE_COMPILE_r(
15988 Perl_sv_catpvf(aTHX_ sv,
15989 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
15990 (UV)trie->startstate,
15991 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
15992 (UV)trie->wordcount,
15995 (UV)TRIE_CHARCOUNT(trie),
15996 (UV)trie->uniquecharcount
15999 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
16000 sv_catpvs(sv, "[");
16001 (void) put_charclass_bitmap_innards(sv,
16002 (IS_ANYOF_TRIE(op))
16004 : TRIE_BITMAP(trie),
16006 sv_catpvs(sv, "]");
16009 } else if (k == CURLY) {
16010 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
16011 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
16012 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
16014 else if (k == WHILEM && o->flags) /* Ordinal/of */
16015 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
16016 else if (k == REF || k == OPEN || k == CLOSE
16017 || k == GROUPP || OP(o)==ACCEPT)
16019 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
16020 if ( RXp_PAREN_NAMES(prog) ) {
16021 if ( k != REF || (OP(o) < NREF)) {
16022 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
16023 SV **name= av_fetch(list, ARG(o), 0 );
16025 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16028 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
16029 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
16030 I32 *nums=(I32*)SvPVX(sv_dat);
16031 SV **name= av_fetch(list, nums[0], 0 );
16034 for ( n=0; n<SvIVX(sv_dat); n++ ) {
16035 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
16036 (n ? "," : ""), (IV)nums[n]);
16038 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
16042 if ( k == REF && reginfo) {
16043 U32 n = ARG(o); /* which paren pair */
16044 I32 ln = prog->offs[n].start;
16045 if (prog->lastparen < n || ln == -1)
16046 Perl_sv_catpvf(aTHX_ sv, ": FAIL");
16047 else if (ln == prog->offs[n].end)
16048 Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
16050 const char *s = reginfo->strbeg + ln;
16051 Perl_sv_catpvf(aTHX_ sv, ": ");
16052 Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
16053 PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
16056 } else if (k == GOSUB)
16057 /* Paren and offset */
16058 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
16059 else if (k == VERB) {
16061 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
16062 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
16063 } else if (k == LOGICAL)
16064 /* 2: embedded, otherwise 1 */
16065 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
16066 else if (k == ANYOF) {
16067 const U8 flags = ANYOF_FLAGS(o);
16069 SV* bitmap_invlist; /* Will hold what the bit map contains */
16072 if (flags & ANYOF_LOCALE_FLAGS)
16073 sv_catpvs(sv, "{loc}");
16074 if (flags & ANYOF_LOC_FOLD)
16075 sv_catpvs(sv, "{i}");
16076 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
16077 if (flags & ANYOF_INVERT)
16078 sv_catpvs(sv, "^");
16080 /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
16082 do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
16085 /* output any special charclass tests (used entirely under use
16087 if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
16089 for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
16090 if (ANYOF_POSIXL_TEST(o,i)) {
16091 sv_catpv(sv, anyofs[i]);
16097 if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
16098 |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
16099 |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
16103 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
16104 if (flags & ANYOF_INVERT)
16105 /*make sure the invert info is in each */
16106 sv_catpvs(sv, "^");
16109 if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
16110 sv_catpvs(sv, "{non-utf8-latin1-all}");
16113 /* output information about the unicode matching */
16114 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
16115 sv_catpvs(sv, "{above_bitmap_all}");
16116 else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
16117 SV *lv; /* Set if there is something outside the bit map. */
16118 bool byte_output = FALSE; /* If something in the bitmap has
16120 SV *only_utf8_locale;
16122 /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
16123 * is used to guarantee that nothing in the bitmap gets
16125 (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
16126 &lv, &only_utf8_locale,
16128 if (lv && lv != &PL_sv_undef) {
16129 char *s = savesvpv(lv);
16130 char * const origs = s;
16132 while (*s && *s != '\n')
16136 const char * const t = ++s;
16138 if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
16139 sv_catpvs(sv, "{outside bitmap}");
16142 sv_catpvs(sv, "{utf8}");
16146 sv_catpvs(sv, " ");
16152 /* Truncate very long output */
16153 if (s - origs > 256) {
16154 Perl_sv_catpvf(aTHX_ sv,
16156 (int) (s - origs - 1),
16162 else if (*s == '\t') {
16176 SvREFCNT_dec_NN(lv);
16179 if ((flags & ANYOF_LOC_FOLD)
16180 && only_utf8_locale
16181 && only_utf8_locale != &PL_sv_undef)
16184 int max_entries = 256;
16186 sv_catpvs(sv, "{utf8 locale}");
16187 invlist_iterinit(only_utf8_locale);
16188 while (invlist_iternext(only_utf8_locale,
16190 put_range(sv, start, end, FALSE);
16192 if (max_entries < 0) {
16193 sv_catpvs(sv, "...");
16197 invlist_iterfinish(only_utf8_locale);
16201 SvREFCNT_dec(bitmap_invlist);
16204 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
16206 else if (k == POSIXD || k == NPOSIXD) {
16207 U8 index = FLAGS(o) * 2;
16208 if (index < C_ARRAY_LENGTH(anyofs)) {
16209 if (*anyofs[index] != '[') {
16212 sv_catpv(sv, anyofs[index]);
16213 if (*anyofs[index] != '[') {
16218 Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
16221 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
16222 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
16223 else if (OP(o) == SBOL)
16224 Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
16226 PERL_UNUSED_CONTEXT;
16227 PERL_UNUSED_ARG(sv);
16228 PERL_UNUSED_ARG(o);
16229 PERL_UNUSED_ARG(prog);
16230 PERL_UNUSED_ARG(reginfo);
16231 #endif /* DEBUGGING */
16237 Perl_re_intuit_string(pTHX_ REGEXP * const r)
16238 { /* Assume that RE_INTUIT is set */
16239 struct regexp *const prog = ReANY(r);
16240 GET_RE_DEBUG_FLAGS_DECL;
16242 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
16243 PERL_UNUSED_CONTEXT;
16247 const char * const s = SvPV_nolen_const(prog->check_substr
16248 ? prog->check_substr : prog->check_utf8);
16250 if (!PL_colorset) reginitcolors();
16251 PerlIO_printf(Perl_debug_log,
16252 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
16254 prog->check_substr ? "" : "utf8 ",
16255 PL_colors[5],PL_colors[0],
16258 (strlen(s) > 60 ? "..." : ""));
16261 return prog->check_substr ? prog->check_substr : prog->check_utf8;
16267 handles refcounting and freeing the perl core regexp structure. When
16268 it is necessary to actually free the structure the first thing it
16269 does is call the 'free' method of the regexp_engine associated to
16270 the regexp, allowing the handling of the void *pprivate; member
16271 first. (This routine is not overridable by extensions, which is why
16272 the extensions free is called first.)
16274 See regdupe and regdupe_internal if you change anything here.
16276 #ifndef PERL_IN_XSUB_RE
16278 Perl_pregfree(pTHX_ REGEXP *r)
16284 Perl_pregfree2(pTHX_ REGEXP *rx)
16286 struct regexp *const r = ReANY(rx);
16287 GET_RE_DEBUG_FLAGS_DECL;
16289 PERL_ARGS_ASSERT_PREGFREE2;
16291 if (r->mother_re) {
16292 ReREFCNT_dec(r->mother_re);
16294 CALLREGFREE_PVT(rx); /* free the private data */
16295 SvREFCNT_dec(RXp_PAREN_NAMES(r));
16296 Safefree(r->xpv_len_u.xpvlenu_pv);
16299 SvREFCNT_dec(r->anchored_substr);
16300 SvREFCNT_dec(r->anchored_utf8);
16301 SvREFCNT_dec(r->float_substr);
16302 SvREFCNT_dec(r->float_utf8);
16303 Safefree(r->substrs);
16305 RX_MATCH_COPY_FREE(rx);
16306 #ifdef PERL_ANY_COW
16307 SvREFCNT_dec(r->saved_copy);
16310 SvREFCNT_dec(r->qr_anoncv);
16311 rx->sv_u.svu_rx = 0;
16316 This is a hacky workaround to the structural issue of match results
16317 being stored in the regexp structure which is in turn stored in
16318 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
16319 could be PL_curpm in multiple contexts, and could require multiple
16320 result sets being associated with the pattern simultaneously, such
16321 as when doing a recursive match with (??{$qr})
16323 The solution is to make a lightweight copy of the regexp structure
16324 when a qr// is returned from the code executed by (??{$qr}) this
16325 lightweight copy doesn't actually own any of its data except for
16326 the starp/end and the actual regexp structure itself.
16332 Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
16334 struct regexp *ret;
16335 struct regexp *const r = ReANY(rx);
16336 const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
16338 PERL_ARGS_ASSERT_REG_TEMP_COPY;
16341 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
16343 SvOK_off((SV *)ret_x);
16345 /* For PVLVs, SvANY points to the xpvlv body while sv_u points
16346 to the regexp. (For SVt_REGEXPs, sv_upgrade has already
16347 made both spots point to the same regexp body.) */
16348 REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
16349 assert(!SvPVX(ret_x));
16350 ret_x->sv_u.svu_rx = temp->sv_any;
16351 temp->sv_any = NULL;
16352 SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
16353 SvREFCNT_dec_NN(temp);
16354 /* SvCUR still resides in the xpvlv struct, so the regexp copy-
16355 ing below will not set it. */
16356 SvCUR_set(ret_x, SvCUR(rx));
16359 /* This ensures that SvTHINKFIRST(sv) is true, and hence that
16360 sv_force_normal(sv) is called. */
16362 ret = ReANY(ret_x);
16364 SvFLAGS(ret_x) |= SvUTF8(rx);
16365 /* We share the same string buffer as the original regexp, on which we
16366 hold a reference count, incremented when mother_re is set below.
16367 The string pointer is copied here, being part of the regexp struct.
16369 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
16370 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
16372 const I32 npar = r->nparens+1;
16373 Newx(ret->offs, npar, regexp_paren_pair);
16374 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16377 Newx(ret->substrs, 1, struct reg_substr_data);
16378 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16380 SvREFCNT_inc_void(ret->anchored_substr);
16381 SvREFCNT_inc_void(ret->anchored_utf8);
16382 SvREFCNT_inc_void(ret->float_substr);
16383 SvREFCNT_inc_void(ret->float_utf8);
16385 /* check_substr and check_utf8, if non-NULL, point to either their
16386 anchored or float namesakes, and don't hold a second reference. */
16388 RX_MATCH_COPIED_off(ret_x);
16389 #ifdef PERL_ANY_COW
16390 ret->saved_copy = NULL;
16392 ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
16393 SvREFCNT_inc_void(ret->qr_anoncv);
16399 /* regfree_internal()
16401 Free the private data in a regexp. This is overloadable by
16402 extensions. Perl takes care of the regexp structure in pregfree(),
16403 this covers the *pprivate pointer which technically perl doesn't
16404 know about, however of course we have to handle the
16405 regexp_internal structure when no extension is in use.
16407 Note this is called before freeing anything in the regexp
16412 Perl_regfree_internal(pTHX_ REGEXP * const rx)
16414 struct regexp *const r = ReANY(rx);
16415 RXi_GET_DECL(r,ri);
16416 GET_RE_DEBUG_FLAGS_DECL;
16418 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
16424 SV *dsv= sv_newmortal();
16425 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
16426 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
16427 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
16428 PL_colors[4],PL_colors[5],s);
16431 #ifdef RE_TRACK_PATTERN_OFFSETS
16433 Safefree(ri->u.offsets); /* 20010421 MJD */
16435 if (ri->code_blocks) {
16437 for (n = 0; n < ri->num_code_blocks; n++)
16438 SvREFCNT_dec(ri->code_blocks[n].src_regex);
16439 Safefree(ri->code_blocks);
16443 int n = ri->data->count;
16446 /* If you add a ->what type here, update the comment in regcomp.h */
16447 switch (ri->data->what[n]) {
16453 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
16456 Safefree(ri->data->data[n]);
16462 { /* Aho Corasick add-on structure for a trie node.
16463 Used in stclass optimization only */
16465 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
16466 #ifdef USE_ITHREADS
16470 refcount = --aho->refcount;
16473 PerlMemShared_free(aho->states);
16474 PerlMemShared_free(aho->fail);
16475 /* do this last!!!! */
16476 PerlMemShared_free(ri->data->data[n]);
16477 /* we should only ever get called once, so
16478 * assert as much, and also guard the free
16479 * which /might/ happen twice. At the least
16480 * it will make code anlyzers happy and it
16481 * doesn't cost much. - Yves */
16482 assert(ri->regstclass);
16483 if (ri->regstclass) {
16484 PerlMemShared_free(ri->regstclass);
16485 ri->regstclass = 0;
16492 /* trie structure. */
16494 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
16495 #ifdef USE_ITHREADS
16499 refcount = --trie->refcount;
16502 PerlMemShared_free(trie->charmap);
16503 PerlMemShared_free(trie->states);
16504 PerlMemShared_free(trie->trans);
16506 PerlMemShared_free(trie->bitmap);
16508 PerlMemShared_free(trie->jump);
16509 PerlMemShared_free(trie->wordinfo);
16510 /* do this last!!!! */
16511 PerlMemShared_free(ri->data->data[n]);
16516 Perl_croak(aTHX_ "panic: regfree data code '%c'",
16517 ri->data->what[n]);
16520 Safefree(ri->data->what);
16521 Safefree(ri->data);
16527 #define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
16528 #define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
16529 #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
16532 re_dup - duplicate a regexp.
16534 This routine is expected to clone a given regexp structure. It is only
16535 compiled under USE_ITHREADS.
16537 After all of the core data stored in struct regexp is duplicated
16538 the regexp_engine.dupe method is used to copy any private data
16539 stored in the *pprivate pointer. This allows extensions to handle
16540 any duplication it needs to do.
16542 See pregfree() and regfree_internal() if you change anything here.
16544 #if defined(USE_ITHREADS)
16545 #ifndef PERL_IN_XSUB_RE
16547 Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
16551 const struct regexp *r = ReANY(sstr);
16552 struct regexp *ret = ReANY(dstr);
16554 PERL_ARGS_ASSERT_RE_DUP_GUTS;
16556 npar = r->nparens+1;
16557 Newx(ret->offs, npar, regexp_paren_pair);
16558 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
16560 if (ret->substrs) {
16561 /* Do it this way to avoid reading from *r after the StructCopy().
16562 That way, if any of the sv_dup_inc()s dislodge *r from the L1
16563 cache, it doesn't matter. */
16564 const bool anchored = r->check_substr
16565 ? r->check_substr == r->anchored_substr
16566 : r->check_utf8 == r->anchored_utf8;
16567 Newx(ret->substrs, 1, struct reg_substr_data);
16568 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
16570 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
16571 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
16572 ret->float_substr = sv_dup_inc(ret->float_substr, param);
16573 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
16575 /* check_substr and check_utf8, if non-NULL, point to either their
16576 anchored or float namesakes, and don't hold a second reference. */
16578 if (ret->check_substr) {
16580 assert(r->check_utf8 == r->anchored_utf8);
16581 ret->check_substr = ret->anchored_substr;
16582 ret->check_utf8 = ret->anchored_utf8;
16584 assert(r->check_substr == r->float_substr);
16585 assert(r->check_utf8 == r->float_utf8);
16586 ret->check_substr = ret->float_substr;
16587 ret->check_utf8 = ret->float_utf8;
16589 } else if (ret->check_utf8) {
16591 ret->check_utf8 = ret->anchored_utf8;
16593 ret->check_utf8 = ret->float_utf8;
16598 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
16599 ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
16602 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
16604 if (RX_MATCH_COPIED(dstr))
16605 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
16607 ret->subbeg = NULL;
16608 #ifdef PERL_ANY_COW
16609 ret->saved_copy = NULL;
16612 /* Whether mother_re be set or no, we need to copy the string. We
16613 cannot refrain from copying it when the storage points directly to
16614 our mother regexp, because that's
16615 1: a buffer in a different thread
16616 2: something we no longer hold a reference on
16617 so we need to copy it locally. */
16618 RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
16619 ret->mother_re = NULL;
16621 #endif /* PERL_IN_XSUB_RE */
16626 This is the internal complement to regdupe() which is used to copy
16627 the structure pointed to by the *pprivate pointer in the regexp.
16628 This is the core version of the extension overridable cloning hook.
16629 The regexp structure being duplicated will be copied by perl prior
16630 to this and will be provided as the regexp *r argument, however
16631 with the /old/ structures pprivate pointer value. Thus this routine
16632 may override any copying normally done by perl.
16634 It returns a pointer to the new regexp_internal structure.
16638 Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
16641 struct regexp *const r = ReANY(rx);
16642 regexp_internal *reti;
16644 RXi_GET_DECL(r,ri);
16646 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
16650 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode),
16651 char, regexp_internal);
16652 Copy(ri->program, reti->program, len+1, regnode);
16654 reti->num_code_blocks = ri->num_code_blocks;
16655 if (ri->code_blocks) {
16657 Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
16658 struct reg_code_block);
16659 Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
16660 struct reg_code_block);
16661 for (n = 0; n < ri->num_code_blocks; n++)
16662 reti->code_blocks[n].src_regex = (REGEXP*)
16663 sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
16666 reti->code_blocks = NULL;
16668 reti->regstclass = NULL;
16671 struct reg_data *d;
16672 const int count = ri->data->count;
16675 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
16676 char, struct reg_data);
16677 Newx(d->what, count, U8);
16680 for (i = 0; i < count; i++) {
16681 d->what[i] = ri->data->what[i];
16682 switch (d->what[i]) {
16683 /* see also regcomp.h and regfree_internal() */
16684 case 'a': /* actually an AV, but the dup function is identical. */
16688 case 'u': /* actually an HV, but the dup function is identical. */
16689 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
16692 /* This is cheating. */
16693 Newx(d->data[i], 1, regnode_ssc);
16694 StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
16695 reti->regstclass = (regnode*)d->data[i];
16698 /* Trie stclasses are readonly and can thus be shared
16699 * without duplication. We free the stclass in pregfree
16700 * when the corresponding reg_ac_data struct is freed.
16702 reti->regstclass= ri->regstclass;
16706 ((reg_trie_data*)ri->data->data[i])->refcount++;
16711 d->data[i] = ri->data->data[i];
16714 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'",
16715 ri->data->what[i]);
16724 reti->name_list_idx = ri->name_list_idx;
16726 #ifdef RE_TRACK_PATTERN_OFFSETS
16727 if (ri->u.offsets) {
16728 Newx(reti->u.offsets, 2*len+1, U32);
16729 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
16732 SetProgLen(reti,len);
16735 return (void*)reti;
16738 #endif /* USE_ITHREADS */
16740 #ifndef PERL_IN_XSUB_RE
16743 - regnext - dig the "next" pointer out of a node
16746 Perl_regnext(pTHX_ regnode *p)
16753 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
16754 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d",
16755 (int)OP(p), (int)REGNODE_MAX);
16758 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
16767 S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...)
16770 STRLEN l1 = strlen(pat1);
16771 STRLEN l2 = strlen(pat2);
16774 const char *message;
16776 PERL_ARGS_ASSERT_RE_CROAK2;
16782 Copy(pat1, buf, l1 , char);
16783 Copy(pat2, buf + l1, l2 , char);
16784 buf[l1 + l2] = '\n';
16785 buf[l1 + l2 + 1] = '\0';
16786 va_start(args, pat2);
16787 msv = vmess(buf, &args);
16789 message = SvPV_const(msv,l1);
16792 Copy(message, buf, l1 , char);
16793 /* l1-1 to avoid \n */
16794 Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
16798 /* Certain characters are output as a sequence with the first being a
16800 #define isBACKSLASHED_PUNCT(c) \
16801 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
16804 S_put_code_point(pTHX_ SV *sv, UV c)
16806 PERL_ARGS_ASSERT_PUT_CODE_POINT;
16809 Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c);
16811 else if (isPRINT(c)) {
16812 const char string = (char) c;
16813 if (isBACKSLASHED_PUNCT(c))
16814 sv_catpvs(sv, "\\");
16815 sv_catpvn(sv, &string, 1);
16818 const char * const mnemonic = cntrl_to_mnemonic((char) c);
16820 Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic);
16823 Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c);
16828 #define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
16831 #define MIN(a,b) ((a) < (b) ? (a) : (b))
16835 S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
16837 /* Appends to 'sv' a displayable version of the range of code points from
16838 * 'start' to 'end'. It assumes that only ASCII printables are displayable
16839 * as-is (though some of these will be escaped by put_code_point()). */
16841 const unsigned int min_range_count = 3;
16843 assert(start <= end);
16845 PERL_ARGS_ASSERT_PUT_RANGE;
16847 while (start <= end) {
16849 const char * format;
16851 if (end - start < min_range_count) {
16853 /* Individual chars in short ranges */
16854 for (; start <= end; start++) {
16855 put_code_point(sv, start);
16860 /* If permitted by the input options, and there is a possibility that
16861 * this range contains a printable literal, look to see if there is
16863 if (allow_literals && start <= MAX_PRINT_A) {
16865 /* If the range begin isn't an ASCII printable, effectively split
16866 * the range into two parts:
16867 * 1) the portion before the first such printable,
16869 * and output them separately. */
16870 if (! isPRINT_A(start)) {
16871 UV temp_end = start + 1;
16873 /* There is no point looking beyond the final possible
16874 * printable, in MAX_PRINT_A */
16875 UV max = MIN(end, MAX_PRINT_A);
16877 while (temp_end <= max && ! isPRINT_A(temp_end)) {
16881 /* Here, temp_end points to one beyond the first printable if
16882 * found, or to one beyond 'max' if not. If none found, make
16883 * sure that we use the entire range */
16884 if (temp_end > MAX_PRINT_A) {
16885 temp_end = end + 1;
16888 /* Output the first part of the split range, the part that
16889 * doesn't have printables, with no looking for literals
16890 * (otherwise we would infinitely recurse) */
16891 put_range(sv, start, temp_end - 1, FALSE);
16893 /* The 2nd part of the range (if any) starts here. */
16896 /* We continue instead of dropping down because even if the 2nd
16897 * part is non-empty, it could be so short that we want to
16898 * output it specially, as tested for at the top of this loop.
16903 /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
16904 * output a sub-range of just the digits or letters, then process
16905 * the remaining portion as usual. */
16906 if (isALPHANUMERIC_A(start)) {
16907 UV mask = (isDIGIT_A(start))
16912 UV temp_end = start + 1;
16914 /* Find the end of the sub-range that includes just the
16915 * characters in the same class as the first character in it */
16916 while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
16921 /* For short ranges, don't duplicate the code above to output
16922 * them; just call recursively */
16923 if (temp_end - start < min_range_count) {
16924 put_range(sv, start, temp_end, FALSE);
16926 else { /* Output as a range */
16927 put_code_point(sv, start);
16928 sv_catpvs(sv, "-");
16929 put_code_point(sv, temp_end);
16931 start = temp_end + 1;
16935 /* We output any other printables as individual characters */
16936 if (isPUNCT_A(start) || isSPACE_A(start)) {
16937 while (start <= end && (isPUNCT_A(start)
16938 || isSPACE_A(start)))
16940 put_code_point(sv, start);
16945 } /* End of looking for literals */
16947 /* Here is not to output as a literal. Some control characters have
16948 * mnemonic names. Split off any of those at the beginning and end of
16949 * the range to print mnemonically. It isn't possible for many of
16950 * these to be in a row, so this won't overwhelm with output */
16951 while (isMNEMONIC_CNTRL(start) && start <= end) {
16952 put_code_point(sv, start);
16955 if (start < end && isMNEMONIC_CNTRL(end)) {
16957 /* Here, the final character in the range has a mnemonic name.
16958 * Work backwards from the end to find the final non-mnemonic */
16959 UV temp_end = end - 1;
16960 while (isMNEMONIC_CNTRL(temp_end)) {
16964 /* And separately output the range that doesn't have mnemonics */
16965 put_range(sv, start, temp_end, FALSE);
16967 /* Then output the mnemonic trailing controls */
16968 start = temp_end + 1;
16969 while (start <= end) {
16970 put_code_point(sv, start);
16976 /* As a final resort, output the range or subrange as hex. */
16978 this_end = (end < NUM_ANYOF_CODE_POINTS)
16980 : NUM_ANYOF_CODE_POINTS - 1;
16981 format = (this_end < 256)
16982 ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
16983 : "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
16984 Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
16990 S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
16992 /* Appends to 'sv' a displayable version of the innards of the bracketed
16993 * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
16994 * output anything, and bitmap_invlist, if not NULL, will point to an
16995 * inversion list of what is in the bit map */
16999 unsigned int punct_count = 0;
17000 SV* invlist = NULL;
17001 SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
17002 bool allow_literals = TRUE;
17004 PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
17006 invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
17008 /* Worst case is exactly every-other code point is in the list */
17009 *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
17011 /* Convert the bit map to an inversion list, keeping track of how many
17012 * ASCII puncts are set, including an extra amount for the backslashed
17014 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
17015 if (BITMAP_TEST(bitmap, i)) {
17016 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
17017 if (isPUNCT_A(i)) {
17019 if isBACKSLASHED_PUNCT(i) {
17026 /* Nothing to output */
17027 if (_invlist_len(*invlist_ptr) == 0) {
17028 SvREFCNT_dec(invlist);
17032 /* Generally, it is more readable if printable characters are output as
17033 * literals, but if a range (nearly) spans all of them, it's best to output
17034 * it as a single range. This code will use a single range if all but 2
17035 * printables are in it */
17036 invlist_iterinit(*invlist_ptr);
17037 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17039 /* If range starts beyond final printable, it doesn't have any in it */
17040 if (start > MAX_PRINT_A) {
17044 /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
17045 * all but two, the range must start and end no later than 2 from
17047 if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
17048 if (end > MAX_PRINT_A) {
17054 if (end - start >= MAX_PRINT_A - ' ' - 2) {
17055 allow_literals = FALSE;
17060 invlist_iterfinish(*invlist_ptr);
17062 /* The legibility of the output depends mostly on how many punctuation
17063 * characters are output. There are 32 possible ASCII ones, and some have
17064 * an additional backslash, bringing it to currently 36, so if any more
17065 * than 18 are to be output, we can instead output it as its complement,
17066 * yielding fewer puncts, and making it more legible. But give some weight
17067 * to the fact that outputting it as a complement is less legible than a
17068 * straight output, so don't complement unless we are somewhat over the 18
17070 if (allow_literals && punct_count > 22) {
17071 sv_catpvs(sv, "^");
17073 /* Add everything remaining to the list, so when we invert it just
17074 * below, it will be excluded */
17075 _invlist_union_complement_2nd(*invlist_ptr, PL_InBitmap, invlist_ptr);
17076 _invlist_invert(*invlist_ptr);
17079 /* Here we have figured things out. Output each range */
17080 invlist_iterinit(*invlist_ptr);
17081 while (invlist_iternext(*invlist_ptr, &start, &end)) {
17082 if (start >= NUM_ANYOF_CODE_POINTS) {
17085 put_range(sv, start, end, allow_literals);
17087 invlist_iterfinish(*invlist_ptr);
17092 #define CLEAR_OPTSTART \
17093 if (optstart) STMT_START { \
17094 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \
17095 " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
17099 #define DUMPUNTIL(b,e) \
17101 node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
17103 STATIC const regnode *
17104 S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
17105 const regnode *last, const regnode *plast,
17106 SV* sv, I32 indent, U32 depth)
17108 U8 op = PSEUDO; /* Arbitrary non-END op. */
17109 const regnode *next;
17110 const regnode *optstart= NULL;
17112 RXi_GET_DECL(r,ri);
17113 GET_RE_DEBUG_FLAGS_DECL;
17115 PERL_ARGS_ASSERT_DUMPUNTIL;
17117 #ifdef DEBUG_DUMPUNTIL
17118 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
17119 last ? last-start : 0,plast ? plast-start : 0);
17122 if (plast && plast < last)
17125 while (PL_regkind[op] != END && (!last || node < last)) {
17127 /* While that wasn't END last time... */
17130 if (op == CLOSE || op == WHILEM)
17132 next = regnext((regnode *)node);
17135 if (OP(node) == OPTIMIZED) {
17136 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
17143 regprop(r, sv, node, NULL);
17144 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
17145 (int)(2*indent + 1), "", SvPVX_const(sv));
17147 if (OP(node) != OPTIMIZED) {
17148 if (next == NULL) /* Next ptr. */
17149 PerlIO_printf(Perl_debug_log, " (0)");
17150 else if (PL_regkind[(U8)op] == BRANCH
17151 && PL_regkind[OP(next)] != BRANCH )
17152 PerlIO_printf(Perl_debug_log, " (FAIL)");
17154 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
17155 (void)PerlIO_putc(Perl_debug_log, '\n');
17159 if (PL_regkind[(U8)op] == BRANCHJ) {
17162 const regnode *nnode = (OP(next) == LONGJMP
17163 ? regnext((regnode *)next)
17165 if (last && nnode > last)
17167 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
17170 else if (PL_regkind[(U8)op] == BRANCH) {
17172 DUMPUNTIL(NEXTOPER(node), next);
17174 else if ( PL_regkind[(U8)op] == TRIE ) {
17175 const regnode *this_trie = node;
17176 const char op = OP(node);
17177 const U32 n = ARG(node);
17178 const reg_ac_data * const ac = op>=AHOCORASICK ?
17179 (reg_ac_data *)ri->data->data[n] :
17181 const reg_trie_data * const trie =
17182 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
17184 AV *const trie_words
17185 = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
17187 const regnode *nextbranch= NULL;
17190 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
17191 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
17193 PerlIO_printf(Perl_debug_log, "%*s%s ",
17194 (int)(2*(indent+3)), "",
17196 ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
17197 SvCUR(*elem_ptr), 60,
17198 PL_colors[0], PL_colors[1],
17200 ? PERL_PV_ESCAPE_UNI
17202 | PERL_PV_PRETTY_ELLIPSES
17203 | PERL_PV_PRETTY_LTGT
17208 U16 dist= trie->jump[word_idx+1];
17209 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
17210 (UV)((dist ? this_trie + dist : next) - start));
17213 nextbranch= this_trie + trie->jump[0];
17214 DUMPUNTIL(this_trie + dist, nextbranch);
17216 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
17217 nextbranch= regnext((regnode *)nextbranch);
17219 PerlIO_printf(Perl_debug_log, "\n");
17222 if (last && next > last)
17227 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
17228 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
17229 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
17231 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
17233 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
17235 else if ( op == PLUS || op == STAR) {
17236 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
17238 else if (PL_regkind[(U8)op] == ANYOF) {
17239 /* arglen 1 + class block */
17240 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL)
17241 ? ANYOF_POSIXL_SKIP
17243 node = NEXTOPER(node);
17245 else if (PL_regkind[(U8)op] == EXACT) {
17246 /* Literal string, where present. */
17247 node += NODE_SZ_STR(node) - 1;
17248 node = NEXTOPER(node);
17251 node = NEXTOPER(node);
17252 node += regarglen[(U8)op];
17254 if (op == CURLYX || op == OPEN)
17258 #ifdef DEBUG_DUMPUNTIL
17259 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
17264 #endif /* DEBUGGING */
17268 * c-indentation-style: bsd
17269 * c-basic-offset: 4
17270 * indent-tabs-mode: nil
17273 * ex: set ts=8 sts=4 sw=4 et: